"""
extractor.py — The Core Extraction Engine

It takes raw claim text and extracts three things:
    1. METRIC — what the claim is about (via metrics.py)
    2. VALUE  — the numeric value being claimed
    3. YEAR   — the year the claim refers to

Example:
    Input:  "India's GDP growth rate was 7.5% in 2024"
    Output: {
        "metric": "GDP growth rate",
        "value": 7.5,
        "year": 2024,
        "confidence": 0.9
    }
"""
|
|
| import re |
| import html |
| import unicodedata |
| from metrics import find_metric, PERCENTAGE_METRICS |
|
|
| |
| |
| |
# --- Year patterns ---------------------------------------------------------
# A stand-alone four-digit calendar year in the range 1900-2099.
_RE_YEAR = re.compile(r"\b((?:19|20)\d{2})\b")

# Fiscal-year spans. Group 1 is the four-digit starting year, group 2 the
# two-digit suffix of the ending year: "FY2024-25", "FY 2024-25", "2023-24".
_RE_FY_FULL = re.compile(r"\bFY\s?(\d{4})-(\d{2})\b", re.IGNORECASE)
_RE_FY_SHORT = re.compile(r"\b(\d{4})-(\d{2})\b")

# --- Numeric value patterns ------------------------------------------------
# In both value patterns a leading "-" counts as a minus sign only when it is
# NOT glued to a preceding word character. Otherwise it is a range or compound
# hyphen ("5-6%", "COVID-19", "2023-24") and must not turn the number negative.
_RE_PERCENT_VAL = re.compile(
    r"((?:(?<!\w)-)?\d+(?:,\d+)*(?:\.\d+)?)\s*(?:%|percent|per\s*cent)",
    re.IGNORECASE,
)
_RE_NUMBER = re.compile(r"(?<!\d)((?:(?<!\w)-)?\d+(?:,\d+)*(?:\.\d+)?)(?!\d)")

# Whole-string test used to skip numbers that are really calendar years.
_RE_YEAR_LIKE = re.compile(r"^(?:19|20)\d{2}$")

# Cheap "is there a percentage anywhere in the text" probe.
_RE_PERCENT_NEAR = re.compile(r"\d\s*(?:%|percent|per\s*cent)", re.IGNORECASE)
|
|
| |
| |
| |
# Building blocks shared by every word-multiplier pattern.
# The amount accepts thousands separators in Western (1,400) and Indian
# (1,00,000) grouping; the consumer strips the commas before converting.
_MULT_AMOUNT = r"(\d+(?:,\d+)*(?:\.\d+)?)"
# Optional rupee marker directly in front of the amount ("₹2", "Rs.2", "rs 2").
_MULT_RUPEE_PREFIX = r"(?:\u20b9|rs\.?\s*)?"

# (compiled pattern, multiplier) pairs; group 1 of each pattern is the amount.
# Order matters: consumers take the first pattern that matches, so the more
# specific "lakh crore" must stay ahead of "lakh" and "crore".
# Each unit tolerates a plural "s" and must end at a word boundary, so that
# "5 millionaires" is not read as "5 million".
_WORD_MULTIPLIERS = [
    (re.compile(_MULT_RUPEE_PREFIX + _MULT_AMOUNT + r"\s*lakh\s*crores?\b", re.IGNORECASE), 1e12),
    (re.compile(_MULT_RUPEE_PREFIX + _MULT_AMOUNT + r"\s*lakhs?\b", re.IGNORECASE), 1e5),
    (re.compile(_MULT_RUPEE_PREFIX + _MULT_AMOUNT + r"\s*crores?\b", re.IGNORECASE), 1e7),
    (re.compile(_MULT_AMOUNT + r"\s*trillions?\b", re.IGNORECASE), 1e12),
    (re.compile(_MULT_AMOUNT + r"\s*billions?\b", re.IGNORECASE), 1e9),
    (re.compile(_MULT_AMOUNT + r"\s*millions?\b", re.IGNORECASE), 1e6),
    (re.compile(_MULT_AMOUNT + r"\s*thousands?\b", re.IGNORECASE), 1e3),
]
|
|
| |
| |
# (compiled pattern, ISO 3166-1 alpha-3 code) pairs.
# Order is the lookup priority: consumers return the code of the first
# pattern that matches anywhere in the text.
_COUNTRY_PATTERNS = [
    (re.compile(r"\bindia(?:'s)?\b", re.IGNORECASE), "IND"),
    (re.compile(r"\bindian\b", re.IGNORECASE), "IND"),
    # Dotted abbreviations end in ".", so a trailing \b could only succeed
    # when a word character follows the dot; use a lookahead instead.
    (re.compile(r"\b(?:united\s+states|usa)\b|\bu\.s\.(?:a\.?)?(?!\w)", re.IGNORECASE), "USA"),
    # Case-sensitive on purpose: lowercase "us" is an ordinary pronoun.
    (re.compile(r"\bUS\b"), "USA"),
    (re.compile(r"\bamerican?\b", re.IGNORECASE), "USA"),
    (re.compile(r"\b(?:united\s+kingdom|uk)\b|\bu\.k\.?(?!\w)", re.IGNORECASE), "GBR"),
    (re.compile(r"\b(?:britain|british)\b", re.IGNORECASE), "GBR"),
    (re.compile(r"\bchin(?:a|ese)\b", re.IGNORECASE), "CHN"),
    (re.compile(r"\bjapan(?:ese)?\b", re.IGNORECASE), "JPN"),
    (re.compile(r"\bgerman(?:y)?\b", re.IGNORECASE), "DEU"),
    # "French" is spelled out; the adjective does not share the "fran" stem.
    (re.compile(r"\b(?:france|french)\b", re.IGNORECASE), "FRA"),
    (re.compile(r"\bbrazil(?:ian)?\b", re.IGNORECASE), "BRA"),
    (re.compile(r"\bcanad(?:a|ian)\b", re.IGNORECASE), "CAN"),
    (re.compile(r"\baustrali(?:a|an)\b", re.IGNORECASE), "AUS"),
    (re.compile(r"\b(?:south\s+)?kor(?:ea|ean)\b", re.IGNORECASE), "KOR"),
]
|
|
|
|
def preprocess_claim(text: str) -> str:
    """
    Sanitize raw user input before extraction.

    Steps, applied in this order:
        1. HTML-unescape: "&amp;" -> "&", "&lt;" -> "<", "&nbsp;" -> U+00A0.
        2. Strip HTML comments and tags: "<b>foo</b>" -> " foo ". Only
           constructs that look like real markup are removed, so comparison
           text such as "< 5% and > 3%" is left intact.
        3. Remove zero-width / BOM characters (U+200B, U+200C, U+200D, U+FEFF).
        4. NFC normalization: unify composed/decomposed Unicode forms.
        5. Collapse every run of whitespace (tabs, newlines, non-breaking
           spaces, repeated spaces) into a single ASCII space.
        6. Strip leading/trailing whitespace.

    Args:
        text: Raw claim text as received from the user.

    Returns:
        The cleaned, single-line claim text.
    """
    # 1. Decode entities first so entity-escaped markup ("&lt;b&gt;") is also
    #    removed by the tag pass below.
    text = html.unescape(text)

    # 2. A tag must start with a letter (optionally after "/", "!" or "?").
    #    A bare "<" followed by a space or digit is a comparison operator.
    text = re.sub(r"<!--.*?-->|<[!/?]?[A-Za-z][^>]*>", " ", text, flags=re.DOTALL)

    # 3. Invisible characters would otherwise split numbers and keywords.
    text = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", text)

    # 4. Canonical composition.
    text = unicodedata.normalize("NFC", text)

    # 5-6. \s is Unicode-aware for str patterns, so the NBSP left behind by
    #      "&nbsp;" collapses together with tabs and newlines.
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
|
|
|
|
| |
def _fiscal_year_end(base_year: int, suffix: int) -> int:
    """Resolve a fiscal-year span (e.g. 2024 and 25) to its ending year (2025)."""
    ending = (base_year // 100) * 100 + suffix
    # A suffix smaller than the base's last two digits means the span
    # crosses a century boundary: 1999-00 ends in 2000, not 1900.
    if ending < base_year:
        ending += 100
    return ending


def extract_year(text: str) -> int | None:
    """
    Extract the most relevant year from claim text.

    N-1: Handles fiscal year formats:
        - "FY2024-25" -> 2025 (ending year; an explicit FY marker always wins)
        - "2023-24"   -> 2024 (short format, accepted only when the suffix is
                         the year right after the base, so dates such as
                         "2024-05-12" and ranges such as "2010-20" are not
                         mistaken for fiscal years)
    Otherwise falls back to the last plain four-digit year in the text.

    Args:
        text: Claim text, normally already passed through preprocess_claim().

    Returns:
        The resolved year, or None when the text contains no year.
    """
    # 1. Explicit "FY" marker.
    fy_match = _RE_FY_FULL.search(text)
    if fy_match:
        return _fiscal_year_end(int(fy_match.group(1)), int(fy_match.group(2)))

    # 2. Last 19xx/20xx year in the text. _RE_YEAR also matches the base of a
    #    short fiscal span ("2023" in "2023-24"), so check whether that year
    #    opens such a span before returning it as a plain calendar year.
    year_matches = list(_RE_YEAR.finditer(text))
    if year_matches:
        last = year_matches[-1]
        year = int(last.group(1))
        span = _RE_FY_SHORT.match(text, last.start())
        if span and int(span.group(2)) == (year + 1) % 100:
            return year + 1
        return year

    # 3. No 19xx/20xx year at all: accept a short span with any four-digit
    #    base, as the original fallback did.
    fy_short = _RE_FY_SHORT.search(text)
    if fy_short:
        return _fiscal_year_end(int(fy_short.group(1)), int(fy_short.group(2)))

    return None
|
|
|
|
|
|
def _blank_short_fiscal_span(match) -> str:
    """Replace a genuine "YYYY-YY" fiscal span with a space; keep anything else."""
    base_year, suffix = int(match.group(1)), int(match.group(2))
    # Only consecutive years form a fiscal span; "2024-05" in a date does not.
    return " " if suffix == (base_year + 1) % 100 else match.group(0)


def extract_value(text: str) -> float | None:
    """
    Extract the numeric value from a claim.

    Priority order:
        1. Percentage values ("7.5%", "8 percent") - first occurrence.
        2. N-2: Word-form multipliers ("1.4 billion", "2 lakh crore") - the
           first pattern in _WORD_MULTIPLIERS that matches.
        3. Plain numbers (fallback). Fiscal-year spans are ignored and
           year-like values are skipped; if every remaining number looks like
           a year, the first one is returned.

    Args:
        text: Claim text, normally already passed through preprocess_claim().

    Returns:
        The extracted value as a float, or None when no number is present.
    """
    # 1. Percentages.
    pct_match = _RE_PERCENT_VAL.search(text)
    if pct_match:
        return _clean_number(pct_match.group(1))

    # 2. Word multipliers, in list order (most specific first).
    for pattern, multiplier in _WORD_MULTIPLIERS:
        m = pattern.search(text)
        if m:
            return _clean_number(m.group(1)) * multiplier

    # 3. Plain numbers. Blank out fiscal-year spans first; otherwise the
    #    two-digit suffix ("24" in "2023-24") is the first non-year number
    #    and would be returned as the claim's value.
    scrubbed = _RE_FY_FULL.sub(" ", text)
    scrubbed = _RE_FY_SHORT.sub(_blank_short_fiscal_span, scrubbed)

    candidates = _RE_NUMBER.findall(scrubbed)
    if not candidates:
        return None

    for num_str in candidates:
        if not _RE_YEAR_LIKE.match(num_str.replace(",", "")):
            return _clean_number(num_str)

    # Every candidate looks like a year; return the first rather than nothing.
    return _clean_number(candidates[0])
|
|
|
|
def _clean_number(raw):
    """
    Parse a numeric string that may contain thousands separators.

    Commas are discarded wherever they appear, so both Western (1,000) and
    Indian (1,00,000) grouping are accepted.

    Args:
        raw (str): A number string such as "7.5", "1,00,000" or "-2.3".

    Returns:
        float: The numeric value of the string.
    """
    without_separators = "".join(raw.split(","))
    return float(without_separators)
|
|
|
|
def extract_country(text: str) -> str:
    """
    N-19: Identify the country a claim refers to.

    The patterns in _COUNTRY_PATTERNS are tried in list order and the ISO 3166
    alpha-3 code of the first one found anywhere in the text is returned.
    When nothing matches, the result is "IND" (India), because B-ware is
    India-focused.

    Supported countries: India, USA, UK, China, Japan, Germany, France,
    Brazil, Canada, Australia, South Korea.
    """
    matching_codes = (
        code for regex, code in _COUNTRY_PATTERNS if regex.search(text)
    )
    return next(matching_codes, "IND")
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def extract_all(text: str) -> dict:
    """
    Run every extractor on a claim and bundle the results into one dict.

    This is the only function main.py endpoints call directly.

    Returns a dict with the keys "original_text" (the claim after
    preprocess_claim), "metric", "value", "year", "country", "value_type"
    ("percentage" or "absolute") and "confidence".
    """
    cleaned = preprocess_claim(text)

    # Individual extractors all work on the sanitized text.
    metric_info = find_metric(cleaned)
    value = extract_value(cleaned)
    year = extract_year(cleaned)
    country = extract_country(cleaned)

    # A value is a percentage when the metric is inherently one, or when the
    # text itself carries a percent marker next to a digit.
    metric_name = metric_info["metric"]
    is_percentage = (
        metric_name in PERCENTAGE_METRICS
        or bool(_RE_PERCENT_NEAR.search(cleaned))
    )
    value_type = "percentage" if is_percentage else "absolute"

    # Overall confidence scales the metric confidence by how complete the
    # extraction is: 0.50 base, +0.30 for a value, +0.20 for a year.
    # No metric match means no confidence at all.
    metric_confidence = metric_info["confidence"]
    if metric_confidence == 0.0:
        overall_confidence = 0.0
    else:
        completeness = (
            0.50
            + (0.30 if value is not None else 0.0)
            + (0.20 if year is not None else 0.0)
        )
        overall_confidence = round(metric_confidence * completeness, 2)

    return {
        "original_text": cleaned,
        "metric": metric_name,
        "value": value,
        "year": year,
        "country": country,
        "value_type": value_type,
        "confidence": overall_confidence,
    }
|
|
if __name__ == "__main__":
    # Manual smoke run: push a set of representative claims through
    # extract_all() and print every extracted field.
    # Non-ASCII characters are written as \u escapes so the demo cannot be
    # corrupted by an encoding round-trip of this source file.
    test_claims = [
        "India's GDP growth rate was 7.5% in 2024",
        "Inflation hit 6.2% last year",
        "The unemployment rate rose to 8% in 2023",
        "Something about random stuff with no numbers",
        "GDP grew from 6% in 2023 to 7.5% in 2024",
        "India's population reached 1.4 billion in 2025",
        "Fiscal deficit was -3.4 percent of GDP in FY2023-24",
        "Per capita income is \u20b91,72,000 in 2024",  # rupee sign
        "US GDP growth rate stood at 2.5% in 2023",
        "China's forex reserves hit $3.2 trillion in 2024",
        "UK unemployment rate was 4.2% in FY2024-25",
    ]

    rule = "=" * 70
    print(rule)
    print("EXTRACTOR \u2014 FULL TEST RUN")  # em dash
    print(rule)

    for claim in test_claims:
        result = extract_all(claim)
        print(f"\nClaim: \"{claim}\"")
        # \u2192 is a right arrow used as the bullet for each field.
        print(f"  \u2192 Metric:     {result['metric']}")
        print(f"  \u2192 Value:      {result['value']}")
        print(f"  \u2192 Year:       {result['year']}")
        print(f"  \u2192 Country:    {result['country']}")
        print(f"  \u2192 Value type: {result['value_type']}")
        print(f"  \u2192 Confidence: {result['confidence']}")

    print("\n" + rule)