# financial-intelligence-ai / scripts/ingest_kaggle_data.py
# (uploaded to the Hugging Face Hub via huggingface_hub; commit 10ff0db)
"""
Real-World Dataset Ingestion Pipeline.
Ingests financial document datasets from Kaggle/HuggingFace and generates
ground truth labels using GPT-4o-mini in our exact Pydantic schema.
Supported datasets:
1. SROIE v2 (Kaggle: urbikn/sroie-datasetv2) β€” 973 scanned receipts
2. CORD v1 (HuggingFace: naver-clova-ix/cord-v1) β€” 1,000 receipts
Usage:
# Process SROIE data (download from Kaggle first)
python scripts/ingest_kaggle_data.py --source sroie \
--sroie-path data/raw/sroie \
--output data/training/real_sroie.jsonl \
--max-docs 200
# Process CORD data (downloads from HuggingFace automatically)
python scripts/ingest_kaggle_data.py --source cord \
--output data/training/real_cord.jsonl \
--max-docs 100
# Merge all sources into final training set
python scripts/ingest_kaggle_data.py --source merge \
--output data/training/merged_raw.jsonl
"""
import os
import sys
import json
import glob
import time
import argparse
from typing import Optional, List

from dotenv import load_dotenv

# Pull OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# Add project root to sys.path so sibling packages resolve when this script
# is run directly (python scripts/ingest_kaggle_data.py).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# GPT-4o-mini prompt (same as generate_ground_truth.py but optimized for noisy OCR)
LABELING_PROMPT = """You are a financial document extraction expert. Given raw OCR text from a scanned receipt or invoice, you must:
1. Identify the document type: "invoice", "purchase_order", "receipt", or "bank_statement"
2. Extract ALL fields into the exact JSON schema below
3. Detect any anomalies present in the document
IMPORTANT RULES:
- Output ONLY valid JSON β€” no markdown, no explanations, no code blocks
- This is REAL OCR text β€” expect noise, typos, and formatting issues. Do your best.
- If a field is not present or unreadable, use null
- If no anomalies exist, return an empty "flags" array: []
- All dates should be normalized to YYYY-MM-DD format
- All monetary amounts should be numbers (not strings)
- confidence_score should reflect OCR quality and your certainty (0.0 to 1.0)
Anomaly categories to check:
- arithmetic_error: math that doesn't add up
- missing_field: required fields absent from the document
- format_anomaly: impossible dates, negative quantities, duplicate entries
- business_logic: round-number fraud, extreme amounts, unusual terms
- cross_field: mismatched references, currency conflicts
Required JSON Schema:
{
"common": {
"document_type": "receipt",
"date": "YYYY-MM-DD or null",
"issuer": {"name": "string or null", "address": "string or null"},
"recipient": null,
"total_amount": number_or_null,
"currency": "USD"
},
"line_items": [
{"description": "string", "quantity": number, "unit_price": number, "amount": number}
],
"type_specific": {
"receipt_number": "string or null",
"payment_method": "string or null",
"store_location": "string or null",
"cashier": "string or null"
},
"flags": [],
"confidence_score": 0.85
}"""
def load_sroie_data(sroie_path: str, max_docs: Optional[int] = None) -> List[dict]:
    """
    Load the SROIE dataset from a local directory.

    Expected structure:
        sroie_path/
        ├── train/
        │   ├── img/       # Receipt images (we skip these)
        │   ├── box/       # Bounding box + text (OCR output)
        │   └── entities/  # Key-value labels (company, date, address, total)
        └── test/
            ├── img/
            └── box/

    A flat layout (``*.txt`` files directly under train/ or test/) is also
    accepted. We read from box/ (OCR text) and entities/ (reference labels).

    Args:
        sroie_path: Root directory of the downloaded SROIE dataset.
        max_docs: If given, return at most this many documents.

    Returns:
        List of dicts with keys: source, doc_id, raw_text, original_labels.
    """
    documents = []
    for split in ["train", "test"]:
        box_dir = os.path.join(sroie_path, split, "box")
        entities_dir = os.path.join(sroie_path, split, "entities")
        if not os.path.exists(box_dir):
            # Try alternative structure (flat: txt files directly under split/)
            box_dir = os.path.join(sroie_path, split)
            entities_dir = os.path.join(sroie_path, split)
        if not os.path.exists(box_dir):
            print(f"  [SKIP] Directory not found: {box_dir}")
            continue
        txt_files = sorted(glob.glob(os.path.join(box_dir, "*.txt")))
        print(f"  Found {len(txt_files)} OCR text files in {split}/")
        for txt_file in txt_files:
            basename = os.path.splitext(os.path.basename(txt_file))[0]
            try:
                with open(txt_file, "r", encoding="utf-8", errors="ignore") as f:
                    lines = f.readlines()
                # SROIE box format: x1,y1,x2,y2,x3,y3,x4,y4,text — keep only
                # the text portion (which may itself contain commas).
                text_parts = []
                for line in lines:
                    line = line.strip()
                    if not line:
                        continue
                    parts = line.split(",")
                    if len(parts) > 8:
                        text = ",".join(parts[8:]).strip()
                        if text:
                            text_parts.append(text)
                    elif len(parts) == 1:
                        # Some files contain plain text with no coordinates.
                        text_parts.append(line)
                raw_text = "\n".join(text_parts)
                # Skip empty / near-empty OCR output.
                if not raw_text.strip() or len(raw_text) < 10:
                    continue
                # Read entity labels if available (kept for reference only).
                entity_file = os.path.join(entities_dir, basename + ".txt")
                original_labels = {}
                if os.path.exists(entity_file):
                    # FIX: was `except (json.JSONDecodeError, Exception)` —
                    # Exception already subsumes JSONDecodeError.
                    try:
                        with open(entity_file, "r", encoding="utf-8") as ef:
                            original_labels = json.load(ef)
                    except Exception:
                        pass  # best-effort: labels are optional metadata
                documents.append({
                    "source": "sroie",
                    "doc_id": basename,
                    "raw_text": raw_text,
                    "original_labels": original_labels,
                })
            except Exception as e:
                print(f"  [SKIP] {basename}: {e}")
                continue
    print(f"  Loaded {len(documents)} SROIE documents total")
    # FIX: only trim (and report trimming) when there is actually an excess;
    # the original printed "Trimmed to N" even when fewer than N were loaded.
    if max_docs is not None and len(documents) > max_docs:
        documents = documents[:max_docs]
        print(f"  Trimmed to {max_docs} documents")
    return documents
def _cord_parse_to_text(gt_parse: dict) -> str:
    """Reconstruct receipt-like raw text from a CORD ``gt_parse`` dict.

    Sections are emitted in order: store header, menu items (with sub-items),
    totals, then payment info. Returns an empty string when none are present.
    """
    text_parts: List[str] = []
    # Store header (name / branch / address / phone).
    store_info = gt_parse.get("store_info", {})
    if store_info:
        for key in ["name", "branch", "address", "tel"]:
            val = store_info.get(key, "")
            if val:
                text_parts.append(str(val))
    # Menu items; each item may carry nested sub-items under "sub".
    menu = gt_parse.get("menu", [])
    if menu:
        text_parts.append("\n--- ITEMS ---")
        for item in menu:
            name = item.get("nm", "")
            cnt = item.get("cnt", "")
            price = item.get("price", "")
            line = f" {name}"
            if cnt:
                line += f" x{cnt}"
            if price:
                line += f" {price}"
            text_parts.append(line)
            for sub in (item.get("sub", []) or []):
                sub_name = sub.get("nm", "")
                sub_price = sub.get("price", "")
                text_parts.append(f" - {sub_name} {sub_price}")
    # Totals section.
    total_info = gt_parse.get("total", {})
    if total_info:
        text_parts.append("\n--- TOTALS ---")
        for key in ["subtotal_price", "tax_price", "total_price",
                    "discount_price", "service_price"]:
            val = total_info.get(key, "")
            if val:
                label = key.replace("_", " ").title()
                text_parts.append(f" {label}: {val}")
    # Payment section.
    pay_info = gt_parse.get("payment", {})
    if pay_info:
        text_parts.append("\n--- PAYMENT ---")
        for key in ["cash_price", "change_price", "credit_card_price"]:
            val = pay_info.get(key, "")
            if val:
                label = key.replace("_", " ").title()
                text_parts.append(f" {label}: {val}")
    return "\n".join(text_parts)


def load_cord_data(max_docs: Optional[int] = None) -> List[dict]:
    """
    Load the CORD v1 receipt dataset from HuggingFace.

    Downloads automatically on first use. Each sample's ``ground_truth``
    JSON is parsed and flattened to pseudo-OCR text via _cord_parse_to_text.

    Args:
        max_docs: If given, process at most this many samples (0 means none).

    Returns:
        List of dicts with keys: source, doc_id, raw_text, original_labels.
        Empty list if the 'datasets' library or the download is unavailable.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("  [ERROR] 'datasets' library not installed. Run: pip install datasets")
        return []
    print("  Downloading CORD v1 from HuggingFace...")
    try:
        dataset = load_dataset("naver-clova-ix/cord-v1", split="train")
    except Exception as e:
        print(f"  [ERROR] Failed to load CORD: {e}")
        return []
    print(f"  Loaded {len(dataset)} CORD receipts")
    documents = []
    # FIX: was `max_docs if max_docs else len(dataset)`, which treated an
    # explicit max_docs=0 as "no limit".
    limit = max_docs if max_docs is not None else len(dataset)
    for i, sample in enumerate(dataset):
        if i >= limit:
            break
        try:
            # CORD stores the parsed annotation as a JSON string.
            gt_str = sample.get("ground_truth", "")
            gt_data = json.loads(gt_str) if isinstance(gt_str, str) else gt_str
            # The useful payload is under "gt_parse" (fall back to the root).
            gt_parse = gt_data.get("gt_parse", gt_data)
            raw_text = _cord_parse_to_text(gt_parse)
            # Skip samples that reconstruct to (nearly) nothing.
            if len(raw_text.strip()) < 10:
                continue
            documents.append({
                "source": "cord",
                "doc_id": f"cord_{i}",
                "raw_text": raw_text,
                "original_labels": gt_parse,
            })
        except Exception:
            # Best-effort ingestion: malformed samples are silently dropped.
            continue
    print(f"  Processed {len(documents)} CORD documents")
    return documents
def label_with_gpt(
    documents: List[dict],
    model: str = "gpt-4o-mini",
    batch_delay: float = 0.15,
) -> List[dict]:
    """
    Use an OpenAI chat model to generate ground-truth labels in our schema.

    Each document's raw_text is sent alongside LABELING_PROMPT; the JSON
    reply is validated, lightly repaired (default flags / confidence_score),
    and collected. Up to 3 attempts are made per document.

    Args:
        documents: Dicts with at least a 'raw_text' field.
        model: OpenAI model to use.
        batch_delay: Delay between documents in seconds (rate limiting).

    Returns:
        List of labeled documents in our training format; empty list when
        the openai package or the API key is unavailable.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("  [ERROR] 'openai' library not installed. Run: pip install openai")
        return []
    api_key = os.getenv("OPENAI_API_KEY")
    # Reject both a missing key and the "sk-your..." placeholder from .env.example.
    if not api_key or api_key.startswith("sk-your"):
        print("\n  ❌ Error: OPENAI_API_KEY not set in .env!")
        return []
    client = OpenAI(api_key=api_key)
    labeled = []
    success = 0
    failed = 0
    print(f"\n  Labeling {len(documents)} documents with {model}...")
    for i, doc in enumerate(documents):
        raw_text = doc["raw_text"]
        user_msg = f"Extract structured data from this financial document:\n\n---\n{raw_text}\n---"
        print(f"  [{i+1}/{len(documents)}] {doc.get('source', '?')}/{doc.get('doc_id', '?')}...", end="")
        for attempt in range(3):
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": LABELING_PROMPT},
                        {"role": "user", "content": user_msg},
                    ],
                    temperature=0.1,  # near-deterministic extraction
                    max_tokens=2048,
                    response_format={"type": "json_object"},
                )
                parsed = json.loads(response.choices[0].message.content)
                # Minimal schema validation; repair optional fields in place.
                if "common" not in parsed:
                    raise ValueError("Missing 'common' field")
                if "document_type" not in parsed.get("common", {}):
                    raise ValueError("Missing 'document_type'")
                parsed.setdefault("flags", [])
                parsed.setdefault("confidence_score", 0.85)
                # Build training document in our format.
                labeled.append({
                    "doc_type": parsed["common"]["document_type"],
                    "raw_text": raw_text,
                    "ground_truth": parsed,
                    "source": doc.get("source", "unknown"),
                })
                print(f" ✅ ({len(parsed['flags'])} flags)")
                success += 1
                break
            except Exception as e:
                # FIX: the original had a separate JSONDecodeError handler that
                # bound `e` but never printed it; report the detail uniformly.
                print(f" [RETRY {attempt+1}: {str(e)[:50]}]", end="")
                if attempt < 2:
                    time.sleep(0.5)  # FIX: no pointless sleep after last attempt
        else:
            # All 3 attempts exhausted without a break.
            print(" ❌ failed")
            failed += 1
        time.sleep(batch_delay)
    print(f"\n  Labeling complete: {success} success, {failed} failed")
    # Rough cost estimate (gpt-4o-mini: $0.15/M input, $0.60/M output tokens,
    # assuming ~1500 input and ~700 output tokens per document).
    est_input = success * 1500 / 1_000_000
    est_output = success * 700 / 1_000_000
    est_cost = est_input * 0.15 + est_output * 0.60
    print(f"  Estimated cost: ~${est_cost:.3f}")
    return labeled
def merge_datasets(output_path: str):
    """
    Merge all data sources into one master JSONL file.

    Sources (missing files are skipped with a warning):
      - data/training/synthetic_raw.jsonl (our Faker-generated docs)
      - data/training/real_sroie.jsonl    (SROIE receipts, GPT-labeled)
      - data/training/real_cord.jsonl     (CORD receipts, GPT-labeled)

    Args:
        output_path: Destination JSONL file for the merged dataset.
    """
    sources = [
        ("data/training/synthetic_raw.jsonl", "synthetic"),
        ("data/training/real_sroie.jsonl", "sroie"),
        ("data/training/real_cord.jsonl", "cord"),
    ]
    all_docs = []
    for filepath, source_name in sources:
        if not os.path.exists(filepath):
            print(f"  [SKIP] {filepath} not found")
            continue
        count = 0
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # FIX: skip blank lines instead of crashing on json.loads("")
                # (e.g. a trailing newline at the end of a JSONL file).
                if not line:
                    continue
                doc = json.loads(line)
                # Tag provenance when missing so the summary below works.
                doc.setdefault("source", source_name)
                all_docs.append(doc)
                count += 1
        print(f"  Loaded {count} docs from {source_name}")
    if not all_docs:
        print("  ❌ No documents found to merge!")
        return
    # FIX: os.makedirs("") raises FileNotFoundError when output_path is a
    # bare filename with no directory component.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for doc in all_docs:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
    # Print distribution summary by source and by document type.
    source_counts = {}
    type_counts = {}
    for doc in all_docs:
        s = doc.get("source", "unknown")
        t = doc.get("doc_type", "unknown")
        source_counts[s] = source_counts.get(s, 0) + 1
        type_counts[t] = type_counts.get(t, 0) + 1
    print("\n  Merged Dataset Summary:")
    print(f"  {'─' * 40}")
    print("  By Source:")
    for src, cnt in sorted(source_counts.items()):
        print(f"    {src:<20}: {cnt}")
    print("  By Document Type:")
    for dtype, cnt in sorted(type_counts.items()):
        print(f"    {dtype:<20}: {cnt}")
    print(f"  {'─' * 40}")
    print(f"  Total: {len(all_docs)} documents")
    print(f"  Saved to: {output_path}")
def _save_jsonl(output: str, docs: List[dict]) -> None:
    """Write docs to output as JSONL, creating parent directories as needed."""
    # FIX: os.makedirs("") raises for bare filenames; guard the dirname.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        for doc in docs:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")


def main():
    """CLI entry point: dispatch to SROIE / CORD ingestion, or merge sources."""
    parser = argparse.ArgumentParser(
        description="Ingest real-world financial datasets and generate training labels"
    )
    parser.add_argument("--source", type=str, required=True,
                        choices=["sroie", "cord", "merge"],
                        help="Data source to process")
    parser.add_argument("--sroie-path", type=str, default="data/raw/sroie",
                        help="Path to downloaded SROIE dataset")
    parser.add_argument("--output", type=str, default=None,
                        help="Output JSONL file path")
    parser.add_argument("--max-docs", type=int, default=None,
                        help="Maximum documents to process")
    parser.add_argument("--model", type=str, default="gpt-4o-mini",
                        help="OpenAI model for labeling")
    args = parser.parse_args()

    print(f"\n{'='*50}")
    print("  Real-World Dataset Ingestion Pipeline")
    print(f"{'='*50}\n")

    # Load documents for the chosen source; the merge path returns early
    # because it has no labeling/saving step of its own.
    if args.source == "sroie":
        output = args.output or "data/training/real_sroie.jsonl"
        print(f"  Source: SROIE v2 ({args.sroie_path})")
        print(f"  Output: {output}\n")
        docs = load_sroie_data(args.sroie_path, args.max_docs)
    elif args.source == "cord":
        output = args.output or "data/training/real_cord.jsonl"
        print("  Source: CORD v1 (HuggingFace)")
        print(f"  Output: {output}\n")
        docs = load_cord_data(args.max_docs)
    else:  # args.source == "merge" (argparse choices guarantee this)
        output = args.output or "data/training/merged_raw.jsonl"
        print("  Merging all data sources...")
        print(f"  Output: {output}\n")
        merge_datasets(output)
        print()
        return

    # FIX: the label-and-save logic below was duplicated verbatim in both
    # the sroie and cord branches; it is now shared via _save_jsonl.
    if docs:
        labeled = label_with_gpt(docs, model=args.model)
        if labeled:
            _save_jsonl(output, labeled)
            print(f"\n  ✅ Saved {len(labeled)} labeled documents to {output}")
    print()
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()