| """ | |
| Entity extraction for financial and crypto-specific entities. | |
| Uses FinBERT for financial NER and custom regex for crypto entities. | |
| """ | |
# Standard library
import re
import logging
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import os

# Third-party: used for HuggingFace Inference API calls
import requests

# Module-level logger; handlers/levels are configured by the importing application.
logger = logging.getLogger(__name__)
class FinancialEntityExtractor:
    """
    Extract financial entities using FinBERT via HuggingFace Inference API.

    Identifies: amounts, dates, institutions, transaction types, etc.
    Uses the hosted Inference API instead of downloading models to save storage.
    """

    # Characters of surrounding text kept on each side of a match.
    _CONTEXT_WINDOW = 50

    def __init__(self, model_name: str = "ProsusAI/finbert"):
        """
        Initialize FinBERT entity extractor with Inference API.

        Args:
            model_name: HuggingFace model name
        """
        self.model_name = model_name
        self.hf_token = os.getenv("HF_TOKEN")
        self.api_url = f"https://api-inference.huggingface.co/models/{model_name}"
        self._initialized = False
        logger.info(
            "FinancialEntityExtractor initialized with model: %s (using Inference API)",
            model_name,
        )

    def _lazy_load(self):
        """Lazy initialization check (validates API config; no model download)."""
        if self._initialized:
            return
        try:
            # Test API availability (lightweight check)
            if self.hf_token:
                logger.info("HF Inference API configured for: %s", self.model_name)
            else:
                logger.warning("HF_TOKEN not set - FinBERT API calls may be rate-limited")
            self._initialized = True
            logger.info("FinBERT Inference API ready")
        except Exception as e:
            logger.error("Error initializing FinBERT API: %s", e)
            logger.warning("Falling back to regex-only extraction")
            # Mark initialized anyway so regex-based extraction still runs.
            self._initialized = True

    def _context(self, text: str, start: int, end: int) -> str:
        """Return the stripped +/-50-char window of text around a match span."""
        lo = max(0, start - self._CONTEXT_WINDOW)
        hi = min(len(text), end + self._CONTEXT_WINDOW)
        return text[lo:hi].strip()

    def _call_finbert_api(self, text: str) -> Optional[Dict]:
        """
        Call FinBERT sentiment API for financial context.

        Args:
            text: Input text

        Returns:
            Sentiment scores or None if error
        """
        if not self.hf_token:
            return None
        try:
            headers = {"Authorization": f"Bearer {self.hf_token}"}
            response = requests.post(
                self.api_url,
                headers=headers,
                json={"inputs": text[:512]},  # Limit to 512 chars
                timeout=5
            )
            if response.status_code == 200:
                return response.json()
            logger.warning("FinBERT API returned %s", response.status_code)
            return None
        except Exception as e:
            logger.debug("FinBERT API call failed: %s", e)
            return None

    def extract_amounts(self, text: str) -> List[Dict]:
        """
        Extract monetary amounts from text.

        Args:
            text: Input text

        Returns:
            List of dictionaries with amount info (deduplicated: a match fully
            contained in an already-extracted span is skipped)
        """
        amounts: List[Dict] = []
        seen_spans: List[Tuple[int, int]] = []
        # Pattern for amounts: $X, €X, £X, ¥X, XM, XB, etc.
        # Ordered most-specific first so the containment check below works.
        patterns = [
            # Currency symbols with numbers
            r'[\$€£¥]\s*(\d+(?:,\d{3})*(?:\.\d{2})?)\s*(million|billion|thousand|M|B|K)?',
            # Numbers with currency words
            r'(\d+(?:,\d{3})*(?:\.\d{2})?)\s*(USD|EUR|GBP|JPY|dollars|euros|pounds)',
            # Percentages
            r'(\d+(?:\.\d+)?)\s*%',
            # Large numbers with abbreviations
            r'(\d+(?:\.\d+)?)\s*(million|billion|trillion|M|B|T)',
        ]
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                # Fix: the patterns overlap (e.g. "$10 million" is also matched
                # as the bare "10 million"); skip matches fully contained in a
                # span we already extracted to avoid duplicate entities.
                if any(match.start() >= s and match.end() <= e for s, e in seen_spans):
                    continue
                seen_spans.append((match.start(), match.end()))
                amounts.append({
                    'text': match.group(0),
                    'start': match.start(),
                    'end': match.end(),
                    'context': self._context(text, match.start(), match.end()),
                    'type': 'amount'
                })
        logger.debug("Extracted %d amounts from text", len(amounts))
        return amounts

    def extract_dates(self, text: str) -> List[Dict]:
        """
        Extract dates and time references.

        Args:
            text: Input text

        Returns:
            List of date entities
        """
        dates: List[Dict] = []
        # Date patterns
        patterns = [
            # YYYY-MM-DD
            r'\d{4}-\d{2}-\d{2}',
            # DD/MM/YYYY or MM/DD/YYYY
            r'\d{1,2}/\d{1,2}/\d{4}',
            # Month Day, Year
            r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}',
            # Q1 2024, Q2 2025, etc.
            r'Q[1-4]\s+\d{4}',
            # Fiscal year references
            r'FY\s*\d{4}',
            # Relative dates
            r'(within|by|before|after)\s+\d+\s+(days|weeks|months|years)',
        ]
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                dates.append({
                    'text': match.group(0),
                    'start': match.start(),
                    'end': match.end(),
                    'context': self._context(text, match.start(), match.end()),
                    'type': 'date'
                })
        logger.debug("Extracted %d dates from text", len(dates))
        return dates

    def extract_institutions(self, text: str) -> List[Dict]:
        """
        Extract financial institutions and regulatory agencies.

        Args:
            text: Input text

        Returns:
            List of institution entities (each carries a 'normalized' canonical name)
        """
        institutions: List[Dict] = []
        # Known institutions and agencies
        known_entities = [
            # US
            'SEC', 'Securities and Exchange Commission', 'FinCEN', 'CFTC',
            'Federal Reserve', 'OCC', 'FDIC', 'Treasury Department',
            # EU
            'ESMA', 'EBA', 'European Central Bank', 'ECB',
            # Singapore
            'MAS', 'Monetary Authority of Singapore',
            # UK
            'FCA', 'Financial Conduct Authority', 'Bank of England',
            # UAE
            'VARA', 'Virtual Asset Regulatory Authority',
            # Exchanges
            'Coinbase', 'Binance', 'Kraken', 'Gemini', 'Bitstamp',
            # Banks
            'JPMorgan', 'Goldman Sachs', 'Morgan Stanley', 'Citibank',
        ]
        for entity in known_entities:
            # Case-insensitive whole-word search
            pattern = r'\b' + re.escape(entity) + r'\b'
            for match in re.finditer(pattern, text, re.IGNORECASE):
                institutions.append({
                    'text': match.group(0),
                    'normalized': entity,
                    'start': match.start(),
                    'end': match.end(),
                    'context': self._context(text, match.start(), match.end()),
                    'type': 'institution'
                })
        logger.debug("Extracted %d institutions from text", len(institutions))
        return institutions

    def extract_all(self, text: str) -> Dict[str, List[Dict]]:
        """
        Extract all financial entities from text.

        Args:
            text: Input text

        Returns:
            Dictionary of entity lists by type
        """
        self._lazy_load()
        return {
            'amounts': self.extract_amounts(text),
            'dates': self.extract_dates(text),
            'institutions': self.extract_institutions(text)
        }
class CryptoEntityExtractor:
    """
    Extract crypto-specific entities: tokens, addresses, protocols, etc.
    """

    # Characters of surrounding text kept on each side of a match.
    _CONTEXT_WINDOW = 50

    def __init__(self):
        """Initialize crypto entity extractor."""
        logger.info("CryptoEntityExtractor initialized")

    def _context(self, text: str, start: int, end: int) -> str:
        """Return the stripped +/-50-char window of text around a match span."""
        lo = max(0, start - self._CONTEXT_WINDOW)
        hi = min(len(text), end + self._CONTEXT_WINDOW)
        return text[lo:hi].strip()

    def extract_tokens(self, text: str) -> List[Dict]:
        """
        Extract cryptocurrency tokens and coin names.

        Args:
            text: Input text

        Returns:
            List of token entities
        """
        tokens: List[Dict] = []
        # Known cryptocurrencies as (full name, ticker symbol) pairs
        known_tokens = [
            # Major coins
            ('Bitcoin', 'BTC'), ('Ethereum', 'ETH'), ('Ripple', 'XRP'),
            ('Cardano', 'ADA'), ('Solana', 'SOL'), ('Polkadot', 'DOT'),
            ('Avalanche', 'AVAX'), ('Polygon', 'MATIC'), ('Chainlink', 'LINK'),
            # Stablecoins
            ('Tether', 'USDT'), ('USD Coin', 'USDC'), ('Dai', 'DAI'),
            ('Binance USD', 'BUSD'), ('TrueUSD', 'TUSD'),
            # DeFi tokens
            ('Uniswap', 'UNI'), ('Aave', 'AAVE'), ('Compound', 'COMP'),
            ('Maker', 'MKR'), ('Curve', 'CRV'),
        ]
        for full_name, symbol in known_tokens:
            # Search for both full name and symbol
            for term in [full_name, symbol]:
                pattern = r'\b' + re.escape(term) + r'\b'
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    tokens.append({
                        'text': match.group(0),
                        'name': full_name,
                        'symbol': symbol,
                        'start': match.start(),
                        'end': match.end(),
                        'context': self._context(text, match.start(), match.end()),
                        'type': 'token'
                    })
        # Generic token pattern: capitalized words ending in "coin" or "token"
        generic_pattern = r'\b([A-Z][a-z]+(?:coin|token|Token|Coin))\b'
        for match in re.finditer(generic_pattern, text):
            token_name = match.group(1)
            # Skip if already captured by the known-token list above
            if any(t['text'].lower() == token_name.lower() for t in tokens):
                continue
            tokens.append({
                'text': token_name,
                'name': token_name,
                'symbol': 'UNKNOWN',
                'start': match.start(),
                'end': match.end(),
                'context': self._context(text, match.start(), match.end()),
                'type': 'token'
            })
        logger.debug("Extracted %d tokens from text", len(tokens))
        return tokens

    def extract_addresses(self, text: str) -> List[Dict]:
        """
        Extract cryptocurrency wallet addresses.

        Args:
            text: Input text

        Returns:
            List of address entities (now includes 'context', consistent with
            the other extractors)
        """
        addresses: List[Dict] = []
        # Ethereum-style addresses (0x followed by 40 hex chars)
        eth_pattern = r'\b0x[a-fA-F0-9]{40}\b'
        for match in re.finditer(eth_pattern, text):
            addresses.append({
                'text': match.group(0),
                'blockchain': 'ethereum',
                'start': match.start(),
                'end': match.end(),
                'context': self._context(text, match.start(), match.end()),
                'type': 'address'
            })
        # Bitcoin-style addresses (starts with 1, 3, or bc1)
        btc_patterns = [
            r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b',  # Legacy/P2SH
            r'\bbc1[a-z0-9]{39,59}\b'  # Bech32
        ]
        for pattern in btc_patterns:
            for match in re.finditer(pattern, text):
                addresses.append({
                    'text': match.group(0),
                    'blockchain': 'bitcoin',
                    'start': match.start(),
                    'end': match.end(),
                    'context': self._context(text, match.start(), match.end()),
                    'type': 'address'
                })
        logger.debug("Extracted %d addresses from text", len(addresses))
        return addresses

    def extract_protocols(self, text: str) -> List[Dict]:
        """
        Extract DeFi protocols and blockchain platforms.

        Args:
            text: Input text

        Returns:
            List of protocol entities
        """
        protocols: List[Dict] = []
        known_protocols = [
            # DeFi protocols
            'Uniswap', 'SushiSwap', 'PancakeSwap', 'Aave', 'Compound',
            'MakerDAO', 'Curve Finance', 'Balancer', 'Yearn Finance',
            # Layer 1 blockchains
            'Ethereum', 'Bitcoin', 'Solana', 'Cardano', 'Avalanche',
            'Polkadot', 'Cosmos', 'Algorand', 'Near Protocol',
            # Layer 2s
            'Polygon', 'Arbitrum', 'Optimism', 'zkSync', 'StarkNet',
            # Other
            'IPFS', 'Chainlink', 'The Graph', 'Filecoin',
        ]
        for protocol in known_protocols:
            pattern = r'\b' + re.escape(protocol) + r'\b'
            for match in re.finditer(pattern, text, re.IGNORECASE):
                protocols.append({
                    'text': match.group(0),
                    'name': protocol,
                    'start': match.start(),
                    'end': match.end(),
                    'context': self._context(text, match.start(), match.end()),
                    'type': 'protocol'
                })
        logger.debug("Extracted %d protocols from text", len(protocols))
        return protocols

    def extract_activities(self, text: str) -> List[Dict]:
        """
        Extract crypto activity mentions (staking, lending, etc.).

        Args:
            text: Input text

        Returns:
            List of activity entities
        """
        activities: List[Dict] = []
        activity_keywords = [
            'staking', 'lending', 'borrowing', 'yield farming', 'liquidity mining',
            'trading', 'swapping', 'mining', 'minting', 'burning',
            'governance', 'voting', 'delegation', 'custody', 'custodial',
            'non-custodial', 'DeFi', 'NFT', 'token sale', 'ICO', 'IEO',
            'airdrop', 'fork', 'bridge', 'cross-chain'
        ]
        for activity in activity_keywords:
            pattern = r'\b' + re.escape(activity) + r'\b'
            for match in re.finditer(pattern, text, re.IGNORECASE):
                activities.append({
                    'text': match.group(0),
                    'activity': activity,
                    'start': match.start(),
                    'end': match.end(),
                    'context': self._context(text, match.start(), match.end()),
                    'type': 'activity'
                })
        logger.debug("Extracted %d activities from text", len(activities))
        return activities

    def extract_all(self, text: str) -> Dict[str, List[Dict]]:
        """
        Extract all crypto entities from text.

        Args:
            text: Input text

        Returns:
            Dictionary of entity lists by type
        """
        return {
            'tokens': self.extract_tokens(text),
            'addresses': self.extract_addresses(text),
            'protocols': self.extract_protocols(text),
            'activities': self.extract_activities(text)
        }
class EntityExtractor:
    """
    Combined entity extractor covering both financial and crypto entities.
    """

    def __init__(self):
        """Create the underlying financial and crypto extractors."""
        self.financial_extractor = FinancialEntityExtractor()
        self.crypto_extractor = CryptoEntityExtractor()
        logger.info("EntityExtractor initialized")

    def extract_all_entities(self, text: str) -> Dict:
        """
        Extract all entities (financial + crypto) from text.

        Args:
            text: Input text

        Returns:
            Dictionary containing all extracted entities
        """
        # Guard clause: empty/None input yields an empty result shell.
        if not text:
            return {'financial': {}, 'crypto': {}, 'summary': {}}

        financial = self.financial_extractor.extract_all(text)
        crypto = self.crypto_extractor.extract_all(text)

        # Per-category totals computed once and reused below.
        financial_total = sum(len(items) for items in financial.values())
        crypto_total = sum(len(items) for items in crypto.values())

        summary = {
            'total_entities': financial_total + crypto_total,
            'financial_count': financial_total,
            'crypto_count': crypto_total,
            'has_amounts': len(financial.get('amounts', [])) > 0,
            'has_dates': len(financial.get('dates', [])) > 0,
            'has_tokens': len(crypto.get('tokens', [])) > 0,
            'has_addresses': len(crypto.get('addresses', [])) > 0,
        }

        logger.info(
            f"Extracted {summary['total_entities']} entities "
            f"({summary['financial_count']} financial, {summary['crypto_count']} crypto)"
        )

        return {
            'financial': financial,
            'crypto': crypto,
            'summary': summary,
            'extracted_at': datetime.now().isoformat(),
        }
# Convenience function
def extract_entities(text: str) -> Dict:
    """
    One-shot helper: build an EntityExtractor and run it on *text*.

    Args:
        text: Input text

    Returns:
        Dictionary of extracted entities
    """
    return EntityExtractor().extract_all_entities(text)
| if __name__ == "__main__": | |
| # Example usage | |
| sample_text = """ | |
| The SEC announced new crypto custody rules on January 15, 2024. | |
| Exchanges handling over $10 million in Bitcoin (BTC) and Ethereum (ETH) | |
| must register by Q3 2024. Staking services and DeFi protocols like | |
| Uniswap may face additional scrutiny. Coinbase and Binance have 90 days | |
| to comply with the new requirements. | |
| """ | |
| entities = extract_entities(sample_text) | |
| print("\n=== Financial Entities ===") | |
| for entity_type, items in entities['financial'].items(): | |
| print(f"\n{entity_type.upper()}: {len(items)}") | |
| for item in items[:3]: # Show first 3 | |
| print(f" - {item['text']}") | |
| print("\n=== Crypto Entities ===") | |
| for entity_type, items in entities['crypto'].items(): | |
| print(f"\n{entity_type.upper()}: {len(items)}") | |
| for item in items[:3]: | |
| print(f" - {item.get('text', item.get('name', 'N/A'))}") | |
| print(f"\n=== Summary ===") | |
| print(f"Total entities: {entities['summary']['total_entities']}") | |