Spaces:

vn6295337
/

Instant-SWOT-Agent

Sleeping

vn6295337 Claude Opus 4.5 committed on Jan 13

Commit

e3b7287

1 Parent(s): 6fd7fcd

fix: Update citation patterns to match new SWOT output format

The SWOT output format changed from `VALUE [M##]` to `[M##] Metric: Value`,
but the citation validator patterns weren't updated, causing:
- 0% citation coverage (pattern didn't match)
- 25+ "uncited" values flagged (all values appeared uncited)
- Score capped at 4.5/10

Changes:
- Added CITATION_PATTERN_NEW for [M##] Metric: Value format
- Kept CITATION_PATTERN_OLD for backwards compatibility
- Added CITATION_REF_PATTERN for simple [M##] counting
- Updated extract_citations() to try both patterns
- Updated find_uncited_numbers() to exclude both patterns
- Updated get_citation_count() to use ref pattern

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show

src/utils/numeric_validator.py +52 -14

src/utils/numeric_validator.py CHANGED Viewed

@@ -9,12 +9,22 @@ import re
 from typing import Optional
-# Pattern to match citations like: $394.3B [M01], 25.3% [M02], 32.5 [M04]
-CITATION_PATTERN = re.compile(
     r'([\d,$\.]+[BMK%]?)\s*\[M(\d{2})\]',
     re.IGNORECASE
 )
 def normalize_value(text: str) -> Optional[float]:
     """
@@ -97,6 +107,10 @@ def extract_citations(text: str) -> list[dict]:
     """
     Extract all [M##] citations from text.
     Returns list of dicts:
     [
         {"ref_id": "M01", "cited_value": "$394.3B", "normalized": 394300000000.0},
@@ -104,16 +118,36 @@ def extract_citations(text: str) -> list[dict]:
     ]
     """
     citations = []
-    for match in CITATION_PATTERN.finditer(text):
         cited_value = match.group(1)
         ref_num = match.group(2)
         ref_id = f"M{ref_num}"
-        normalized = normalize_value(cited_value)
-        citations.append({
-            "ref_id": ref_id,
-            "cited_value": cited_value,
-            "normalized": normalized
-        })
     return citations
@@ -262,11 +296,15 @@ def find_uncited_numbers(swot_text: str, metric_reference: dict) -> list[dict]:
     """
     uncited = []
-    # Get all cited positions to exclude
-    cited_matches = list(CITATION_PATTERN.finditer(swot_text))
     cited_positions = set()
-    for match in cited_matches:
-        # Mark the entire citation span as "cited"
         cited_positions.update(range(match.start(), match.end()))
     # Find all metric-like numbers
@@ -344,7 +382,7 @@ def validate_uncited_numbers(swot_text: str, metric_reference: dict) -> list[str
 def get_citation_count(swot_text: str) -> int:
     """Count the number of [M##] citations in the text."""
-    return len(CITATION_PATTERN.findall(swot_text))
 def validate_minimum_citations(swot_text: str, metric_reference: dict, min_ratio: float = 0.5) -> dict:

 from typing import Optional
+# Pattern to match citations in NEW format: [M01] Revenue: $394.3B - insight
+# Matches: [M##] followed by metric name, colon, and value
+CITATION_PATTERN_NEW = re.compile(
+    r'\[M(\d{2})\]\s*[^:]+:\s*(\$?[\d,]+\.?\d*[BMKTx%]?)',
+    re.IGNORECASE
+)
+# Pattern to match citations in OLD format: $394.3B [M01] (kept for backwards compatibility)
+CITATION_PATTERN_OLD = re.compile(
     r'([\d,$\.]+[BMK%]?)\s*\[M(\d{2})\]',
     re.IGNORECASE
 )
+# Format-agnostic pattern matching any bare [M##] reference (used for citation counting)
+CITATION_REF_PATTERN = re.compile(r'\[M(\d{2})\]', re.IGNORECASE)
 def normalize_value(text: str) -> Optional[float]:
     """
     """
     Extract all [M##] citations from text.
+    Supports both formats:
+    - NEW: [M01] Revenue: $394.3B - insight
+    - OLD: $394.3B [M01]
     Returns list of dicts:
     [
         {"ref_id": "M01", "cited_value": "$394.3B", "normalized": 394300000000.0},
     ]
     """
     citations = []
+    seen_refs = set()
+    # Try NEW format first: [M##] Metric: Value
+    for match in CITATION_PATTERN_NEW.finditer(text):
+        ref_num = match.group(1)
+        cited_value = match.group(2)
+        ref_id = f"M{ref_num}"
+        if ref_id not in seen_refs:
+            normalized = normalize_value(cited_value)
+            citations.append({
+                "ref_id": ref_id,
+                "cited_value": cited_value,
+                "normalized": normalized
+            })
+            seen_refs.add(ref_id)
+    # Also try OLD format: Value [M##]
+    for match in CITATION_PATTERN_OLD.finditer(text):
         cited_value = match.group(1)
         ref_num = match.group(2)
         ref_id = f"M{ref_num}"
+        if ref_id not in seen_refs:
+            normalized = normalize_value(cited_value)
+            citations.append({
+                "ref_id": ref_id,
+                "cited_value": cited_value,
+                "normalized": normalized
+            })
+            seen_refs.add(ref_id)
     return citations
     """
     uncited = []
+    # Get all cited positions to exclude (check both NEW and OLD patterns)
     cited_positions = set()
+    # NEW format: [M##] Metric: Value
+    for match in CITATION_PATTERN_NEW.finditer(swot_text):
+        cited_positions.update(range(match.start(), match.end()))
+    # OLD format: Value [M##]
+    for match in CITATION_PATTERN_OLD.finditer(swot_text):
         cited_positions.update(range(match.start(), match.end()))
     # Find all metric-like numbers
 def get_citation_count(swot_text: str) -> int:
     """Count the number of [M##] citations in the text."""
+    return len(CITATION_REF_PATTERN.findall(swot_text))
 def validate_minimum_citations(swot_text: str, metric_reference: dict, min_ratio: float = 0.5) -> dict: