Spaces:
Sleeping
Sleeping
fix: Update citation patterns to match new SWOT output format
Browse files
The SWOT output format changed from `VALUE [M##]` to `[M##] Metric: Value`,
but the citation validator patterns weren't updated, causing:
- 0% citation coverage (pattern didn't match)
- 25+ "uncited" values flagged (all values appeared uncited)
- Score capped at 4.5/10
Changes:
- Added CITATION_PATTERN_NEW for [M##] Metric: Value format
- Kept CITATION_PATTERN_OLD for backwards compatibility
- Added CITATION_REF_PATTERN for simple [M##] counting
- Updated extract_citations() to try both patterns
- Updated find_uncited_numbers() to exclude both patterns
- Updated get_citation_count() to count matches of CITATION_REF_PATTERN
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- src/utils/numeric_validator.py +52 -14
src/utils/numeric_validator.py
CHANGED
|
@@ -9,12 +9,22 @@ import re
|
|
| 9 |
from typing import Optional
|
| 10 |
|
| 11 |
|
| 12 |
-
# Pattern to match citations
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
r'([\d,$\.]+[BMK%]?)\s*\[M(\d{2})\]',
|
| 15 |
re.IGNORECASE
|
| 16 |
)
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def normalize_value(text: str) -> Optional[float]:
|
| 20 |
"""
|
|
@@ -97,6 +107,10 @@ def extract_citations(text: str) -> list[dict]:
|
|
| 97 |
"""
|
| 98 |
Extract all [M##] citations from text.
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
Returns list of dicts:
|
| 101 |
[
|
| 102 |
{"ref_id": "M01", "cited_value": "$394.3B", "normalized": 394300000000.0},
|
|
@@ -104,16 +118,36 @@ def extract_citations(text: str) -> list[dict]:
|
|
| 104 |
]
|
| 105 |
"""
|
| 106 |
citations = []
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
cited_value = match.group(1)
|
| 109 |
ref_num = match.group(2)
|
| 110 |
ref_id = f"M{ref_num}"
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
| 117 |
return citations
|
| 118 |
|
| 119 |
|
|
@@ -262,11 +296,15 @@ def find_uncited_numbers(swot_text: str, metric_reference: dict) -> list[dict]:
|
|
| 262 |
"""
|
| 263 |
uncited = []
|
| 264 |
|
| 265 |
-
# Get all cited positions to exclude
|
| 266 |
-
cited_matches = list(CITATION_PATTERN.finditer(swot_text))
|
| 267 |
cited_positions = set()
|
| 268 |
-
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
cited_positions.update(range(match.start(), match.end()))
|
| 271 |
|
| 272 |
# Find all metric-like numbers
|
|
@@ -344,7 +382,7 @@ def validate_uncited_numbers(swot_text: str, metric_reference: dict) -> list[str
|
|
| 344 |
|
| 345 |
def get_citation_count(swot_text: str) -> int:
|
| 346 |
"""Count the number of [M##] citations in the text."""
|
| 347 |
-
return len(
|
| 348 |
|
| 349 |
|
| 350 |
def validate_minimum_citations(swot_text: str, metric_reference: dict, min_ratio: float = 0.5) -> dict:
|
|
|
|
| 9 |
from typing import Optional
|
| 10 |
|
| 11 |
|
| 12 |
+
# Pattern to match citations in NEW format: [M01] Revenue: $394.3B - insight
|
| 13 |
+
# Matches: [M##] followed by metric name, colon, and value
|
| 14 |
+
CITATION_PATTERN_NEW = re.compile(
|
| 15 |
+
r'\[M(\d{2})\]\s*[^:]+:\s*(\$?[\d,]+\.?\d*[BMKTx%]?)',
|
| 16 |
+
re.IGNORECASE
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Pattern to match citations in OLD format: $394.3B [M01] (kept for backwards compatibility)
|
| 20 |
+
CITATION_PATTERN_OLD = re.compile(
|
| 21 |
r'([\d,$\.]+[BMK%]?)\s*\[M(\d{2})\]',
|
| 22 |
re.IGNORECASE
|
| 23 |
)
|
| 24 |
|
| 25 |
+
# Format-agnostic pattern that matches any bare [M##] reference (for citation counting)
|
| 26 |
+
CITATION_REF_PATTERN = re.compile(r'\[M(\d{2})\]', re.IGNORECASE)
|
| 27 |
+
|
| 28 |
|
| 29 |
def normalize_value(text: str) -> Optional[float]:
|
| 30 |
"""
|
|
|
|
| 107 |
"""
|
| 108 |
Extract all [M##] citations from text.
|
| 109 |
|
| 110 |
+
Supports both formats:
|
| 111 |
+
- NEW: [M01] Revenue: $394.3B - insight
|
| 112 |
+
- OLD: $394.3B [M01]
|
| 113 |
+
|
| 114 |
Returns list of dicts:
|
| 115 |
[
|
| 116 |
{"ref_id": "M01", "cited_value": "$394.3B", "normalized": 394300000000.0},
|
|
|
|
| 118 |
]
|
| 119 |
"""
|
| 120 |
citations = []
|
| 121 |
+
seen_refs = set()
|
| 122 |
+
|
| 123 |
+
# Try NEW format first: [M##] Metric: Value
|
| 124 |
+
for match in CITATION_PATTERN_NEW.finditer(text):
|
| 125 |
+
ref_num = match.group(1)
|
| 126 |
+
cited_value = match.group(2)
|
| 127 |
+
ref_id = f"M{ref_num}"
|
| 128 |
+
if ref_id not in seen_refs:
|
| 129 |
+
normalized = normalize_value(cited_value)
|
| 130 |
+
citations.append({
|
| 131 |
+
"ref_id": ref_id,
|
| 132 |
+
"cited_value": cited_value,
|
| 133 |
+
"normalized": normalized
|
| 134 |
+
})
|
| 135 |
+
seen_refs.add(ref_id)
|
| 136 |
+
|
| 137 |
+
# Also try OLD format: Value [M##]
|
| 138 |
+
for match in CITATION_PATTERN_OLD.finditer(text):
|
| 139 |
cited_value = match.group(1)
|
| 140 |
ref_num = match.group(2)
|
| 141 |
ref_id = f"M{ref_num}"
|
| 142 |
+
if ref_id not in seen_refs:
|
| 143 |
+
normalized = normalize_value(cited_value)
|
| 144 |
+
citations.append({
|
| 145 |
+
"ref_id": ref_id,
|
| 146 |
+
"cited_value": cited_value,
|
| 147 |
+
"normalized": normalized
|
| 148 |
+
})
|
| 149 |
+
seen_refs.add(ref_id)
|
| 150 |
+
|
| 151 |
return citations
|
| 152 |
|
| 153 |
|
|
|
|
| 296 |
"""
|
| 297 |
uncited = []
|
| 298 |
|
| 299 |
+
# Get all cited positions to exclude (check both NEW and OLD patterns)
|
|
|
|
| 300 |
cited_positions = set()
|
| 301 |
+
|
| 302 |
+
# NEW format: [M##] Metric: Value
|
| 303 |
+
for match in CITATION_PATTERN_NEW.finditer(swot_text):
|
| 304 |
+
cited_positions.update(range(match.start(), match.end()))
|
| 305 |
+
|
| 306 |
+
# OLD format: Value [M##]
|
| 307 |
+
for match in CITATION_PATTERN_OLD.finditer(swot_text):
|
| 308 |
cited_positions.update(range(match.start(), match.end()))
|
| 309 |
|
| 310 |
# Find all metric-like numbers
|
|
|
|
| 382 |
|
| 383 |
def get_citation_count(swot_text: str) -> int:
|
| 384 |
"""Count the number of [M##] citations in the text."""
|
| 385 |
+
return len(CITATION_REF_PATTERN.findall(swot_text))
|
| 386 |
|
| 387 |
|
| 388 |
def validate_minimum_citations(swot_text: str, metric_reference: dict, min_ratio: float = 0.5) -> dict:
|