File size: 5,407 Bytes
dcc24f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
"""
Claude API Auto-Labeling Script.
Uses Claude API to automatically label financial emails for training.
Generates high-quality labels at scale.
Author: Ranjit Behera
Usage:
export ANTHROPIC_API_KEY="your-key-here"
python scripts/claude_labeling.py --limit 100
"""
import json
import os
import time
from pathlib import Path
from typing import Optional
try:
import anthropic
HAS_ANTHROPIC = True
except ImportError:
HAS_ANTHROPIC = False
print("⚠️ anthropic package not installed. Run: pip install anthropic")
# JSON-lines corpus that load_unlabeled_emails() reads candidate emails from.
CORPUS_FILE = Path("data") / "corpus" / "emails" / "financial_emails.jsonl"

# JSON-lines file that run_labeling() writes the labeled samples to.
OUTPUT_FILE = Path("data") / "synthetic" / "claude_labeled.jsonl"

# Prompt template sent to Claude; ``{email_text}`` is filled in via str.format.
EXTRACTION_PROMPT = "\n".join((
    "You are a financial entity extraction expert. Extract structured data from this Indian bank email.",
    "EMAIL:",
    "{email_text}",
    "Extract the following fields (return empty string if not found):",
    "- amount: The transaction amount (numbers only, no currency symbols)",
    '- type: "credit" or "debit"',
    "- date: Transaction date (keep original format)",
    "- account: Last 4 digits of account number",
    "- reference: UPI/NEFT/IMPS reference number (12+ digits)",
    "- merchant: Merchant/recipient name (lowercase)",
    "- bank: Bank name (hdfc/icici/sbi/axis/kotak/phonepe/gpay/paytm)",
    "Respond ONLY with valid JSON, no explanation:",
))
def _first_json_object(text: str) -> Optional[dict]:
    """Return the first decodable JSON object embedded in *text*, or None.

    Each ``{`` is tried in turn as a possible start and handed to the JSON
    decoder itself, so nested objects and braces inside string values are
    handled and any prose or code fences around the object are ignored.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # Decoding from a "{" can only ever produce a JSON object (dict).
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; move on to the next one.
            start = text.find("{", start + 1)
            continue
        return candidate
    return None


def extract_with_claude(
    email_text: str,
    client,
    *,
    model: str = "claude-sonnet-4-20250514",
    max_chars: int = 1000,
    prompt_template: Optional[str] = None,
) -> Optional[dict]:
    """Use Claude to extract entities from email.

    Args:
        email_text: Raw email body. Only the first ``max_chars`` characters
            are placed into the prompt.
        client: Object exposing ``messages.create(model=..., max_tokens=...,
            messages=...)`` whose result has ``content[0].text`` (the caller
            in this file passes an ``anthropic.Anthropic`` instance).
        model: Model identifier forwarded to ``messages.create``.
        max_chars: Number of leading characters of ``email_text`` to send.
        prompt_template: ``str.format`` template containing an
            ``{email_text}`` placeholder. Defaults to ``EXTRACTION_PROMPT``.

    Returns:
        The dict decoded from the first JSON object in the reply, or ``None``
        when the request fails or the reply contains no decodable object.
    """
    template = EXTRACTION_PROMPT if prompt_template is None else prompt_template
    try:
        message = client.messages.create(
            model=model,
            max_tokens=300,
            messages=[
                {
                    "role": "user",
                    "content": template.format(email_text=email_text[:max_chars]),
                }
            ],
        )
        response_text = message.content[0].text
    except Exception as e:
        # Best-effort labeling: a single failed request (network, auth, rate
        # limit, unexpected reply shape) is reported and skipped so the rest
        # of the batch can continue.
        print(f" Error: {e}")
        return None
    return _first_json_object(response_text)
def load_unlabeled_emails(limit: int = 100, corpus_file: Optional[Path] = None) -> list:
    """Load emails that need labeling.

    Reads a JSON-lines corpus and keeps the records whose ``body`` mentions
    a transaction word (debited/credited/received/paid) and an amount
    marker (rs./rs /inr/₹) and is longer than 50 characters.

    Args:
        limit: Maximum number of emails to return. Values <= 0 yield ``[]``.
        corpus_file: Path of the JSON-lines corpus to read. Defaults to
            ``CORPUS_FILE``.

    Returns:
        A list of dicts with keys ``text`` (the body), ``subject`` and
        ``sender``, in file order, at most ``limit`` long.

    Raises:
        FileNotFoundError: If the corpus file does not exist.
    """
    emails = []
    if limit <= 0:
        return emails

    path = CORPUS_FILE if corpus_file is None else corpus_file
    transaction_words = ('debited', 'credited', 'received', 'paid')
    amount_markers = ('rs.', 'rs ', 'inr', '₹')

    # Explicit UTF-8: the filter below looks for the rupee sign, which the
    # platform default encoding may not be able to decode.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Blank or malformed line: skip it, keep scanning.
                continue
            if not isinstance(data, dict):
                continue
            body = data.get('body') or ''
            if not isinstance(body, str):
                continue

            # Filter for transaction emails
            body_lower = body.lower()
            has_transaction = any(word in body_lower for word in transaction_words)
            has_amount = any(marker in body_lower for marker in amount_markers)
            if has_transaction and has_amount and len(body) > 50:
                emails.append({
                    'text': body,
                    'subject': data.get('subject', ''),
                    'sender': data.get('sender', ''),
                })
                if len(emails) >= limit:
                    break
    return emails
def run_labeling(limit: int = 100, delay_seconds: float = 0.5):
    """Run Claude labeling on unlabeled emails.

    Loads candidate emails, asks Claude to extract entities from each one,
    turns every result that has a non-empty ``amount`` into a
    prompt/completion training sample, and writes the samples to
    ``OUTPUT_FILE`` as JSON lines (the file is overwritten).

    Args:
        limit: Maximum number of candidate emails to label.
        delay_seconds: Pause between consecutive API requests.

    Returns:
        The list of labeled samples, or ``None`` when a precondition fails
        (``anthropic`` not installed, ``ANTHROPIC_API_KEY`` unset, or the
        corpus file missing).
    """
    print("=" * 60)
    print("🤖 CLAUDE API AUTO-LABELING")
    print("=" * 60)

    if not HAS_ANTHROPIC:
        print("❌ Please install anthropic: pip install anthropic")
        return None

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("❌ Please set ANTHROPIC_API_KEY environment variable")
        print(" export ANTHROPIC_API_KEY='your-key-here'")
        return None

    client = anthropic.Anthropic(api_key=api_key)

    print(f"\n1. Loading unlabeled emails (limit: {limit})...")
    try:
        emails = load_unlabeled_emails(limit=limit)
    except FileNotFoundError:
        # Report a missing corpus like the other unmet preconditions instead
        # of crashing with a traceback.
        print(f" ❌ Corpus file not found: {CORPUS_FILE}")
        return None
    print(f" Found {len(emails)} candidates")

    print("\n2. Labeling with Claude...")
    labeled = []
    total = len(emails)
    for position, email in enumerate(emails, start=1):
        # flush so the progress marker shows before the blocking API call
        print(f" [{position}/{total}] Extracting...", end=" ", flush=True)
        entities = extract_with_claude(email['text'], client)
        if entities and entities.get('amount'):
            # Create training sample
            prompt = (
                "Extract financial entities from this email:\n"
                f"{email['text'][:500]}\n"
                "Extract: amount, type, date, account, reference, merchant\n"
                "Output JSON:"
            )
            labeled.append({
                'prompt': prompt,
                'completion': json.dumps(entities, indent=2),
                'source': 'claude_labeled',
            })
            print(f"✅ {entities.get('amount')}")
        else:
            print("❌ No entities found")

        # Rate limit: only needed between requests, not after the last one.
        if position < total:
            time.sleep(delay_seconds)

    # Save
    print("\n3. Saving labeled data...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(sample) + '\n' for sample in labeled)
    print(f" ✅ Saved {len(labeled)} labeled samples to {OUTPUT_FILE}")

    # Show sample
    if labeled:
        print("\n📧 Sample:")
        print(labeled[0]['completion'])
    return labeled
if __name__ == "__main__":
    # Script entry point: read --limit from the command line and start a
    # labeling run with it.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--limit',
        type=int,
        default=50,
        help='Number of emails to label',
    )
    options = cli.parse_args()
    run_labeling(limit=options.limit)
|