Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- app.py +94 -0
- fintext/__init__.py +4 -0
- fintext/extractor.py +261 -0
- fintext/utils.py +241 -0
- requirements.txt +5 -0
app.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
fintext-extractor: Transaction Extraction from Bank SMS
|
| 3 |
+
|
| 4 |
+
Gradio demo for the two-stage NER pipeline that extracts structured
|
| 5 |
+
transaction data from bank SMS/notifications using ONNX Runtime.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import time
|
| 10 |
+
import gradio as gr
|
| 11 |
+
from fintext import FintextExtractor
|
| 12 |
+
|
| 13 |
+
# Load model at startup (downloads ~1.8GB on first run)
|
| 14 |
+
print("Loading fintext-extractor model...")
|
| 15 |
+
extractor = FintextExtractor.from_pretrained("Sowrabhm/fintext-extractor", precision="fp16")
|
| 16 |
+
print("Model loaded!")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def extract_transaction(sms_text: str) -> str:
    """Run the two-stage extraction pipeline and return formatted JSON.

    Args:
        sms_text: Raw bank SMS / notification text entered by the user.

    Returns:
        A pretty-printed JSON string with the classification verdict and
        confidence, the extracted transaction fields (only when the message
        is a transaction), and total inference latency in milliseconds.
    """
    if not sms_text or not sms_text.strip():
        return json.dumps({"error": "Please enter SMS text"}, indent=2)

    start = time.time()

    # Stage 1: classification — also supplies the confidence shown to the user.
    cls_result = extractor.classify(sms_text)

    if cls_result["is_transaction"]:
        # Stage 2: full extraction. (extract() re-runs classification
        # internally; only its extraction output matters here.)
        result = extractor.extract(sms_text)
    else:
        # Short-circuit: previously extract() was still invoked here, which
        # re-ran the classifier a second time only to return an all-None
        # payload. Build that payload directly and halve the latency for
        # non-transaction messages.
        result = {
            "is_transaction": False,
            "transaction_amount": None,
            "transaction_type": None,
            "transaction_date": None,
            "transaction_description": None,
            "masked_account_digits": None,
        }

    elapsed_ms = (time.time() - start) * 1000

    # Build rich output: verdict + confidence first, fields only for positives.
    output = {
        "is_transaction": result["is_transaction"],
        "confidence": round(cls_result["confidence"], 3),
    }

    if result["is_transaction"]:
        output["transaction_amount"] = result["transaction_amount"]
        output["transaction_type"] = result["transaction_type"]
        output["transaction_date"] = result["transaction_date"]
        output["transaction_description"] = result["transaction_description"]
        output["masked_account_digits"] = result["masked_account_digits"]

    output["inference_time_ms"] = round(elapsed_ms, 1)

    # default=str keeps the dump robust should a non-JSON-native value slip in.
    return json.dumps(output, indent=2, default=str)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Example SMS messages (all synthetic).
# Mix of positives (debits/credits) and hard negatives (OTP, FD maturity,
# EMI reminder) so the demo shows both pipeline stages working.
examples = [
    ["Rs.5,000 debited from a/c XX1234 for Amazon Pay on 08-Mar-26"],
    ["Credit Alert: INR 25,000 credited to a/c XX5678 on 15-Jan-2026"],
    ["INR 3,499.00 paid to Netflix via card ending 9876 on 01-Feb-2026"],
    ["Dear Customer, Rs.850.50 has been credited to your a/c XX2468 on 05-Mar-2026. UPI Ref: 678912345"],
    ["Transaction of Rs.15,750 at Flipkart on 28-Feb-2026 from card XX3579. Avl bal: Rs.42,300"],
    ["OTP 483921 for transaction of Rs.1,200. Do not share with anyone."],
    ["Your FD of Rs.50,000 matures on 20-Apr-2026. Visit branch."],
    ["Reminder: EMI of Rs.12,500 due on 10-Mar-2026 for loan XX4321"],
]
|
| 63 |
+
|
| 64 |
+
# Build Gradio interface: single textbox in, JSON code block out.
demo = gr.Interface(
    fn=extract_transaction,
    inputs=gr.Textbox(
        label="SMS / Notification Text",
        placeholder="Paste a bank SMS here...",
        lines=3,
    ),
    outputs=gr.Code(
        label="Extracted Transaction Data",
        language="json",
    ),
    title="fintext-extractor",
    description=(
        "Extract structured transaction data from bank SMS using on-device NER. "
        "Two-stage pipeline: DeBERTa classifier filters non-transactions, "
        "then GLiNER2 extracts amount, date, type, description, and account digits.\n\n"
        "**Try the examples below or paste your own SMS text.**"
    ),
    article=(
        "**Links:** "
        "[Model](https://huggingface.co/Sowrabhm/fintext-extractor) | "
        "[GitHub](https://github.com/sowrabhmv/fintext-extractor) | "
        "License: CC-BY-4.0"
    ),
    examples=examples,
    # Examples are run live (not pre-cached) so the displayed latency is real.
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()
|
fintext/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Public package surface: re-export the single pipeline entry point so
# callers can write `from fintext import FintextExtractor`.
from fintext.extractor import FintextExtractor

__all__ = ["FintextExtractor"]
__version__ = "1.0.0"
|
fintext/extractor.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Two-stage ONNX inference pipeline for transaction extraction from SMS text.
|
| 2 |
+
|
| 3 |
+
Stage 1 — Classification: determines whether the message describes a completed
|
| 4 |
+
financial transaction (debit or credit).
|
| 5 |
+
|
| 6 |
+
Stage 2 — Extraction: pulls structured fields (amount, date, type, description,
|
| 7 |
+
masked account digits) from messages classified as transactions.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
import numpy as np
|
| 15 |
+
import onnxruntime as ort
|
| 16 |
+
from tokenizers import Tokenizer
|
| 17 |
+
|
| 18 |
+
from fintext.utils import (
|
| 19 |
+
CLASSIFICATION_LABELS,
|
| 20 |
+
EXTRACTION_FIELDS,
|
| 21 |
+
SCHEMA_TOKENS,
|
| 22 |
+
decode_spans,
|
| 23 |
+
normalize_date,
|
| 24 |
+
parse_amount,
|
| 25 |
+
split_into_words,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class FintextExtractor:
    """Two-stage ONNX inference for transaction extraction from SMS text.

    Stage 1 (:meth:`classify`) is a binary classifier deciding whether the
    message describes a completed transaction.  Stage 2
    (:meth:`_extract_fields`) is a span extractor that pulls structured
    fields out of positively classified messages.
    """

    def __init__(self, model_dir: str, precision: str = "fp16") -> None:
        """Load ONNX models and tokenizers from a local directory.

        Args:
            model_dir: Path to directory containing onnx/, tokenizer/,
                tokenizer_extraction/ sub-directories.
            precision: ``"fp16"`` or ``"fp32"`` -- which ONNX model variant to
                load.

        Raises:
            ValueError: If *precision* is not one of the supported variants.
        """
        if precision not in ("fp16", "fp32"):
            raise ValueError(f"precision must be 'fp16' or 'fp32', got '{precision}'")

        self._precision = precision
        self._model_dir = model_dir

        # ONNX session options shared by both sessions.
        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        opts.intra_op_num_threads = 4

        # Load classification model (stage 1).
        cls_path = os.path.join(model_dir, "onnx", f"deberta_classifier_{precision}.onnx")
        self._cls_session = ort.InferenceSession(
            cls_path, opts, providers=["CPUExecutionProvider"]
        )

        # Load extraction model (stage 2).
        ext_path = os.path.join(model_dir, "onnx", f"extraction_full_{precision}.onnx")
        self._ext_session = ort.InferenceSession(
            ext_path, opts, providers=["CPUExecutionProvider"]
        )

        # Load tokenizers — the two stages use different vocabularies, hence
        # two separate tokenizer files.
        cls_tok_path = os.path.join(model_dir, "tokenizer", "tokenizer.json")
        ext_tok_path = os.path.join(model_dir, "tokenizer_extraction", "tokenizer.json")
        self._cls_tokenizer = Tokenizer.from_file(cls_tok_path)
        self._ext_tokenizer = Tokenizer.from_file(ext_tok_path)

        # Configure classification tokenizer: fixed 128-token window.
        self._cls_tokenizer.enable_truncation(max_length=128)
        self._cls_tokenizer.enable_padding(length=128)

    @classmethod
    def from_pretrained(
        cls,
        repo_id: str = "Sowrabhm/fintext-extractor",
        precision: str = "fp16",
    ) -> FintextExtractor:
        """Download models from Hugging Face Hub and load them.

        Args:
            repo_id: Hugging Face model repo ID.
            precision: ``"fp16"`` or ``"fp32"``.

        Returns:
            A ready-to-use :class:`FintextExtractor`.
        """
        # Imported lazily so purely-local usage does not require the hub client.
        from huggingface_hub import snapshot_download

        # Download only the files needed for the requested precision.
        allow = [
            f"onnx/deberta_classifier_{precision}.onnx",
            f"onnx/deberta_classifier_{precision}.onnx.data",
            f"onnx/extraction_full_{precision}.onnx",
            f"onnx/extraction_full_{precision}.onnx.data",
            "tokenizer/*",
            "tokenizer_extraction/*",
            "config.json",
        ]
        local_dir = snapshot_download(repo_id, allow_patterns=allow)
        return cls(local_dir, precision=precision)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract(self, text: str, received_date: str | None = None) -> dict:
        """Run full two-stage pipeline on a single SMS text.

        Args:
            text: SMS / notification text.
            received_date: Optional fallback date in DD-MM-YYYY format.

        Returns:
            dict with keys: ``is_transaction``, ``transaction_amount``,
            ``transaction_type``, ``transaction_date``,
            ``transaction_description``, ``masked_account_digits``.
        """
        # Stage 1: Classification — cheap gate before the heavier extractor.
        cls_result = self.classify(text)

        if not cls_result["is_transaction"]:
            # Negative: return the fixed all-None payload without running stage 2.
            return {
                "is_transaction": False,
                "transaction_amount": None,
                "transaction_type": None,
                "transaction_date": None,
                "transaction_description": None,
                "masked_account_digits": None,
            }

        # Stage 2: Extraction
        return self._extract_fields(text, received_date)

    def classify(self, text: str) -> dict:
        """Run classification only (stage 1).

        Returns:
            dict with ``is_transaction`` (bool) and ``confidence`` (float).
            ``confidence`` is the probability of the *predicted* class.
        """
        # Tokenize with padding/truncation to 128 (configured in __init__).
        encoded = self._cls_tokenizer.encode(text)
        input_ids = np.array([encoded.ids], dtype=np.int64)
        attention_mask = np.array([encoded.attention_mask], dtype=np.int64)

        # Run classification
        outputs = self._cls_session.run(
            None,
            {"input_ids": input_ids, "attention_mask": attention_mask},
        )

        logits = outputs[0][0]  # [2] -- logits for [non-transaction, transaction]

        # Numerically stable softmax (max subtracted before exp).
        exp_logits = np.exp(logits - np.max(logits))
        probs = exp_logits / exp_logits.sum()

        is_transaction = bool(probs[1] > 0.5)
        confidence = float(probs[1]) if is_transaction else float(probs[0])

        return {"is_transaction": is_transaction, "confidence": confidence}

    def extract_batch(
        self, texts: list[str], received_date: str | None = None
    ) -> list[dict]:
        """Run extraction on multiple texts sequentially.

        Args:
            texts: List of SMS / notification texts.
            received_date: Optional fallback date applied to every text.

        Returns:
            List of extraction result dicts, in input order.
        """
        return [self.extract(t, received_date) for t in texts]

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _extract_fields(self, text: str, received_date: str | None = None) -> dict:
        """Stage 2: Extract transaction fields using the extraction model."""
        # Split text into words with character spans (schema-compatible split).
        word_info = split_into_words(text)
        words = [w for w, _, _ in word_info]
        word_spans = [(s, e) for _, s, e in word_info]
        num_words = len(words)
        # Lower-cased to match how the extraction model was trained/exported.
        text_words_lower = [w.lower() for w in words]

        # Build combined schema + text input (schema prompt precedes the text).
        combined_tokens = SCHEMA_TOKENS + text_words_lower
        schema_len = len(SCHEMA_TOKENS)

        # Subword-tokenize each combined token, build words_mask: the first
        # subword of each *text* word carries its 1-indexed word number, all
        # schema subwords and continuation subwords carry 0.
        all_subword_ids: list[int] = []
        words_mask_values: list[int] = []

        for i, token in enumerate(combined_tokens):
            encoded = self._ext_tokenizer.encode(token, add_special_tokens=False)
            subword_ids = encoded.ids
            all_subword_ids.extend(subword_ids)

            if i >= schema_len:
                # Text word: first subword gets 1-indexed word number
                word_number = i - schema_len + 1
                words_mask_values.append(word_number)
                words_mask_values.extend([0] * (len(subword_ids) - 1))
            else:
                # Schema token: all get 0
                words_mask_values.extend([0] * len(subword_ids))

        # Truncate to 512 if needed.
        # NOTE(review): if truncation cuts off text words, text_lengths below
        # still reports the full word count — presumably the exported model
        # tolerates this; confirm against the export script.
        max_len = 512
        seq_len = min(len(all_subword_ids), max_len)

        # Build tensors
        input_ids = np.array([all_subword_ids[:seq_len]], dtype=np.int64)
        attention_mask = np.ones((1, seq_len), dtype=np.int64)
        words_mask = np.array([words_mask_values[:seq_len]], dtype=np.int64)
        text_lengths = np.array([num_words], dtype=np.int64)

        # Run extraction model
        outputs = self._ext_session.run(
            None,
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "words_mask": words_mask,
                "text_lengths": text_lengths,
            },
        )

        type_logits = outputs[0][0]  # [2] -- softmax probs for [DEBIT, CREDIT]
        span_scores = outputs[1][0]  # [4, num_words, max_width]

        # Decode transaction type
        transaction_type = CLASSIFICATION_LABELS[int(np.argmax(type_logits))]

        # Decode entity spans
        spans = decode_spans(span_scores, text, words, word_spans)

        # Post-process fields; each span is (extracted_text, confidence).
        raw_amount = spans.get("transaction_amount")
        raw_date = spans.get("transaction_date")
        raw_desc = spans.get("transaction_description")
        raw_digits = spans.get("masked_account_digits")

        amount = parse_amount(raw_amount[0]) if raw_amount else None
        date = normalize_date(raw_date[0], received_date) if raw_date else received_date
        description = raw_desc[0] if raw_desc else None
        digits = raw_digits[0] if raw_digits else None

        # Validate: must have amount + type to be a valid transaction
        is_transaction = amount is not None and transaction_type is not None

        return {
            "is_transaction": is_transaction,
            "transaction_amount": amount,
            "transaction_type": transaction_type if is_transaction else None,
            "transaction_date": date,
            "transaction_description": description,
            "masked_account_digits": digits,
        }
|
fintext/utils.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Post-processing utilities for transaction extraction.
|
| 2 |
+
|
| 3 |
+
Ported from the Android Kotlin GLiNER2 ONNX runner. Provides tokenisation,
|
| 4 |
+
span decoding, amount parsing, and date normalisation for bank SMS messages.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Schema prompt prepended to every extraction input. Structure markers:
# [P] = task, [L] = classification label, [C] = extraction field,
# [SEP_STRUCT]/[SEP_TEXT] = section separators. Must match the export exactly.
SCHEMA_TOKENS: list[str] = [
    "(", "[P]", "transaction_type",
    "(", "[L]", "DEBIT", "[L]", "CREDIT", ")", ")",
    "[SEP_STRUCT]",
    "(", "[P]", "transaction_info",
    "(", "[C]", "transaction_amount",
    "[C]", "transaction_date",
    "[C]", "transaction_description",
    "[C]", "masked_account_digits", ")", ")",
    "[SEP_TEXT]",
]
"""Fixed schema token sequence matching the exported ONNX model."""

# Order matters: index i of the span head output corresponds to field i here.
EXTRACTION_FIELDS: list[str] = [
    "transaction_amount",
    "transaction_date",
    "transaction_description",
    "masked_account_digits",
]
"""Ordered field names for the span-extraction head."""

# Index 0 / 1 of the type head map to DEBIT / CREDIT respectively.
CLASSIFICATION_LABELS: list[str] = ["DEBIT", "CREDIT"]
"""Labels emitted by the classification head."""
|
| 41 |
+
|
| 42 |
+
# ---------------------------------------------------------------------------
# Tokenisation
# ---------------------------------------------------------------------------

# Word splitter. Alternatives are ordered most-specific first so URLs and
# e-mail addresses stay single tokens instead of being split on punctuation.
_WORD_PATTERN = re.compile(
    r"(?:https?://\S+|www\.\S+)"  # URLs
    r"|[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}"  # emails
    r"|@[a-z0-9_]+"  # @-mentions
    r"|\w+(?:[-_]\w+)*"  # words (with hyphens/underscores)
    r"|\S",  # single non-space fallback
    re.IGNORECASE,
)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def split_into_words(text: str) -> list[tuple[str, int, int]]:
    """Tokenise *text* the way GLiNER2's WhitespaceTokenSplitter does.

    Each element of the returned list is ``(word, char_start, char_end)``.
    """
    # Same alternatives, most-specific first: URLs, e-mail addresses,
    # @-mentions, hyphen/underscore-joined words, then any lone symbol.
    # (re.compile hits the module's pattern cache on repeat calls.)
    splitter = re.compile(
        r"(?:https?://\S+|www\.\S+)"
        r"|[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}"
        r"|@[a-z0-9_]+"
        r"|\w+(?:[-_]\w+)*"
        r"|\S",
        re.IGNORECASE,
    )
    tokens: list[tuple[str, int, int]] = []
    for hit in splitter.finditer(text):
        tokens.append((hit.group(), hit.start(), hit.end()))
    return tokens
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ---------------------------------------------------------------------------
# Amount parsing
# ---------------------------------------------------------------------------

# Currency markers removed before number extraction: Rs., Rs, INR, rupee sign.
_CURRENCY_PATTERN = re.compile(r"(?:Rs\.?|INR|₹)\s*", re.IGNORECASE)
# First run of digits/commas with an optional decimal part.
_NUMBER_PATTERN = re.compile(r"[\d,]+(?:\.\d+)?")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def parse_amount(raw: str) -> float | None:
    """Extract the first numeric value from *raw*, ignoring currency markers.

    Recognises Rs., Rs, INR, and the rupee sign; thousands separators are
    dropped before conversion. Returns None when no number can be found.
    """
    # Remove every currency marker, then scan for the first digit/comma run.
    without_currency = re.sub(r"(?:Rs\.?|INR|₹)\s*", "", raw, flags=re.IGNORECASE).strip()
    number = re.search(r"[\d,]+(?:\.\d+)?", without_currency)
    if number is None:
        return None
    try:
        return float(number.group().replace(",", ""))
    except ValueError:
        # e.g. a match consisting solely of commas.
        return None
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# ---------------------------------------------------------------------------
# Date normalisation
# ---------------------------------------------------------------------------

# English month names — both 3-letter abbreviations and full names — mapped
# to month number. Keys are lower-case; callers must lower-case before lookup.
_MONTH_MAP: dict[str, int] = {
    "jan": 1, "january": 1,
    "feb": 2, "february": 2,
    "mar": 3, "march": 3,
    "apr": 4, "april": 4,
    "may": 5,
    "jun": 6, "june": 6,
    "jul": 7, "july": 7,
    "aug": 8, "august": 8,
    "sep": 9, "september": 9,
    "oct": 10, "october": 10,
    "nov": 11, "november": 11,
    "dec": 12, "december": 12,
}
|
| 106 |
+
|
| 107 |
+
# Patterns ordered from most specific to least specific — 4-digit years are
# tried before 2-digit ones so "08-03-2026" never half-matches as DD-MM-YY.
_DATE_PATTERNS: list[re.Pattern[str]] = [
    # DD-MM-YYYY or DD/MM/YYYY
    re.compile(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})"),
    # DD-Mon-YYYY or DD/Mon/YYYY
    re.compile(
        r"(\d{1,2})[/\-]([A-Za-z]+)[/\-](\d{4})"
    ),
    # DD-MM-YY or DD/MM/YY — (?!\d) stops a 4-digit year matching as YY
    re.compile(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})(?!\d)"),
    # DD-Mon-YY or DD/Mon/YY
    re.compile(
        r"(\d{1,2})[/\-]([A-Za-z]+)[/\-](\d{2})(?!\d)"
    ),
    # DDMonYYYY (e.g. 23Dec2025)
    re.compile(r"(\d{1,2})([A-Za-z]+)(\d{4})"),
]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def _century_window(yy: int) -> int:
|
| 127 |
+
"""Apply century windowing: YY > 50 -> 19YY, else 20YY."""
|
| 128 |
+
return 1900 + yy if yy > 50 else 2000 + yy
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _parse_month(token: str) -> int | None:
|
| 132 |
+
"""Return 1-12 for a numeric or named month, or *None*."""
|
| 133 |
+
if token.isdigit():
|
| 134 |
+
val = int(token)
|
| 135 |
+
return val if 1 <= val <= 12 else None
|
| 136 |
+
return _MONTH_MAP.get(token.lower())
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def normalize_date(raw: str, received_date: str | None = None) -> str | None:
    """Parse a date string in various Indian SMS formats and return DD-MM-YYYY.

    Supported input formats:
        DD-MM-YYYY, DD/MM/YYYY, DD-MM-YY, DD/MM/YY,
        DD-Mon-YYYY, DD-Mon-YY, DDMonYYYY.

    Falls back to *received_date* (which must already be DD-MM-YYYY) when
    *raw* cannot be parsed. Returns *None* if nothing works.

    Args:
        raw: Candidate date text extracted from the SMS.
        received_date: Optional fallback date, already formatted DD-MM-YYYY.

    Returns:
        Normalised ``DD-MM-YYYY`` string, the fallback, or None.
    """
    import datetime

    for pattern in _DATE_PATTERNS:
        m = pattern.search(raw)
        if not m:
            continue

        day_s, month_s, year_s = m.group(1), m.group(2), m.group(3)

        day = int(day_s)
        month = _parse_month(month_s)  # already guarantees 1-12 or None
        if month is None:
            continue

        year = int(year_s)
        if year < 100:
            year = _century_window(year)

        # Only accept plausible transaction years.
        if not (2000 <= year <= 2100):
            continue
        # Calendar-aware validation. The previous flat `1 <= day <= 31`
        # check accepted impossible dates such as 31-02-2026 or 29-02-2025;
        # datetime.date rejects them (and day 0) with ValueError.
        try:
            datetime.date(year, month, day)
        except ValueError:
            continue

        return f"{day:02d}-{month:02d}-{year}"

    # Fallback when no pattern yielded a valid date.
    if received_date is not None:
        return received_date

    return None
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ---------------------------------------------------------------------------
|
| 183 |
+
# Span decoding
|
| 184 |
+
# ---------------------------------------------------------------------------
|
| 185 |
+
|
| 186 |
+
def decode_spans(
    span_scores: np.ndarray,
    text: str,
    words: list[str],
    word_spans: list[tuple[int, int]],
    threshold: float = 0.3,
) -> dict[str, Optional[tuple[str, float]]]:
    """Turn the span-extraction head output into per-field text values.

    Parameters
    ----------
    span_scores:
        Array of shape ``[4, num_words, max_width]`` — one slice per
        extraction field, in schema order.
    text:
        The original SMS text.
    words:
        Tokenised words (from :func:`split_into_words`).
    word_spans:
        ``(char_start, char_end)`` pairs for each word.
    threshold:
        Minimum confidence to accept a span.

    Returns
    -------
    dict
        Mapping of field name to ``(extracted_text, confidence)`` or
        *None* when no span exceeds *threshold*.
    """
    # Field order mirrors the schema baked into the exported model.
    field_names = (
        "transaction_amount",
        "transaction_date",
        "transaction_description",
        "masked_account_digits",
    )
    total_words = len(words)
    decoded: dict[str, Optional[tuple[str, float]]] = {}

    for slice_idx, name in enumerate(field_names):
        scores = span_scores[slice_idx]  # [num_words, max_width]
        top_score = 0.0
        top_bounds: tuple[int, int] | None = None

        for begin in range(min(total_words, scores.shape[0])):
            for width in range(scores.shape[1]):
                last = begin + width
                if last >= total_words:
                    break
                candidate = float(scores[begin, width])
                # Strict '>' keeps the earliest-scanned span on ties.
                if candidate > top_score and candidate > threshold:
                    top_score = candidate
                    top_bounds = (begin, last)

        if top_bounds is None:
            decoded[name] = None
        else:
            begin, last = top_bounds
            snippet = text[word_spans[begin][0]:word_spans[last][1]]
            decoded[name] = (snippet, top_score)

    return decoded
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
onnxruntime>=1.17.0
|
| 2 |
+
numpy>=1.24.0
|
| 3 |
+
huggingface_hub>=0.20.0
|
| 4 |
+
tokenizers>=0.15.0
|
| 5 |
+
gradio>=4.0.0
|