Spaces:
Sleeping
Sleeping
File size: 51,328 Bytes
398a289 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 | from __future__ import annotations
import argparse
import concurrent.futures
import csv
import difflib
import hashlib
import json
import logging
import os
import random
import re
import threading
import time
from dataclasses import dataclass
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()

# Endpoint and model name; both can be overridden through the environment.
MEGALLM_BASE_URL = os.getenv("MEGALLM_BASE_URL", "https://ai.megallm.io/v1")
MEGALLM_MODEL = os.getenv("MEGALLM_MODEL", "deepseek-r1-distill-llama-70b")

# Collect API keys from MEGALLM_API_KEY and MEGALLM_API_KEY_2.
# Each variable may hold one key or several comma-separated keys. The split
# happens while collecting: previously the comma-splitting branch only ran
# when no key had been found yet, which could never be true once
# MEGALLM_API_KEY was set, so "a,b" ended up as a single invalid key.
api_keys = [
    key.strip()
    for env_name in ("MEGALLM_API_KEY", "MEGALLM_API_KEY_2")
    for key in os.getenv(env_name, "").split(",")
    if key.strip()
]

if not api_keys:
    logging.warning("No API keys found in env! Expect failures.")
    api_keys = [""]  # Fallback to empty to allow init (will fail on generate)

# One client per key; llm_generate picks among them to spread the load.
clients = [OpenAI(base_url=MEGALLM_BASE_URL, api_key=key) for key in api_keys]
# NOTE(review): INFO-level message; it is only visible if logging has been
# configured before this module-level code runs.
logging.info(f"Loaded {len(clients)} API key(s). Workers will match key count.")
def _claim_from_json_reply(text: str) -> str:
    """Extract the 'claim' value from a model reply expected to contain JSON.

    Raises an exception (json.JSONDecodeError or ValueError) when the reply
    cannot be interpreted, so the caller can retry the request.
    """
    # The object may be wrapped in markdown fences or surrounded by chatter;
    # grab the outermost {...} span if there is one.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    json_str = match.group(0) if match else text
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        # "Extra data" means a valid JSON value was followed by more text
        # (e.g. several objects); e.pos marks where the first value ends.
        if "Extra data" not in str(e):
            raise
        try:
            data = json.loads(json_str[: e.pos])
        except Exception:
            logging.error(f"JSON parsing failed even after recovery: {e}")
            raise e
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object with key 'claim', got {type(data).__name__}"
        )
    claim = data.get("claim", "")
    if not isinstance(claim, str):
        raise ValueError(
            f"Expected 'claim' to be a string, got {type(claim).__name__}"
        )
    return claim


def llm_generate(
    prompt: str,
    json_mode: bool = False,
    system_prompt: str = "You are a careful finance data generator.",
) -> str:
    """
    Generate text using the LLM, retrying with exponential back-off.

    Args:
        prompt: User prompt. In json_mode an instruction to answer with a
            JSON object holding the key 'claim' is appended.
        json_mode: When True the reply is parsed as JSON and the value of
            its 'claim' key is returned ("" if the key is missing).
        system_prompt: Leading part of the system message.

    Returns:
        The reply text with <think>...</think> sections removed (or the
        extracted claim in json_mode). Returns "" if every attempt fails.
    """
    max_retries = 6
    temperature = 0.1 if json_mode else 0.2
    max_tokens = 2048
    if json_mode:
        prompt += (
            "\n\nReturn a valid JSON object with the key 'claim'. "
            'Example: {"claim": "Your generated claim here"}. '
            "Do NOT return any other text, markdown, or explanations."
        )
    # The messages do not change between attempts, so build them once.
    system_content = (
        f"{system_prompt} "
        "Follow instructions exactly. "
        + ("Return valid JSON only." if json_mode else "Return ONLY the requested text.")
    )
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": prompt},
    ]
    for attempt in range(max_retries):
        try:
            # Random selection of client to distribute load across keys
            client = random.choice(clients)
            resp = client.chat.completions.create(
                model=MEGALLM_MODEL,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=90.0,  # Client-side timeout
            )
            text = (resp.choices[0].message.content or "").strip()
            # Filters <think> tags from models like DeepSeek
            text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
            if json_mode:
                return _claim_from_json_reply(text)
            return text
        except Exception as e:
            # Best-effort boundary: any API or parsing failure triggers a retry.
            logging.error(
                f"LLM API call failed (attempt {attempt + 1}/{max_retries}): {str(e)}"
            )
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the fallback return.
            if attempt + 1 < max_retries:
                time.sleep(min(60.0, (2**attempt) + random.random()))
    logging.error("LLM generation failed after all retries")
    return ""  # fallback if all retries fail
def validate_claim(
    claim: str,
    evidence: str,
    seen_hashes: set,
    lock: threading.Lock = None,
    check_similarity: bool = True,
) -> bool:
    """
    Quality gate for a generated claim. Returns True only if every check passes.

    Checks, in order (on the whitespace-normalized claim):
    1. Length: between 5 and 50 words inclusive.
    2. Leakage: none of the forbidden marker phrases appears (case-insensitive).
    3. Format: does not start like a bullet or numbered list item.
    4. Similarity (optional): SequenceMatcher ratio against the evidence
       must not exceed 0.85.
    5. Deduplication: the claim's MD5 digest must not already be in
       seen_hashes; an accepted claim's digest is added to the set. The
       check-and-add runs under `lock` when one is supplied.
    """
    if not claim:
        return False

    text = _norm_ws(claim)

    # 1. Length gate
    word_count = len(text.split())
    if not 5 <= word_count <= 50:
        logging.warning(f"Rejecting claim (length {word_count}): {text}")
        return False

    # 2. Leakage gate: phrases that reveal the prompt or the label reasoning
    forbidden = [
        "evidence:",
        "false because",
        "true because",
        "is supported by",
        "contradicts",
        "return only",
        "json",
    ]
    lowered = text.lower()
    leaked = next((marker for marker in forbidden if marker in lowered), None)
    if leaked is not None:
        logging.warning(f"Rejecting claim (leakage '{leaked}'): {text}")
        return False

    # 3. Format gate (simple heuristics for lists/bullets)
    if text.startswith(("- ", "* ", "1. ")):
        logging.warning(f"Rejecting claim (format): {text}")
        return False

    # 4. Similarity gate (prevent copying evidence verbatim)
    if check_similarity:
        ratio = difflib.SequenceMatcher(None, text, evidence).ratio()
        if ratio > 0.85:
            logging.warning(f"Rejecting claim (similarity {ratio:.2f}): {text}")
            return False

    # 5. Deduplication: membership test and insertion happen together
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()

    def _register_digest() -> bool:
        if digest in seen_hashes:
            logging.warning(f"Rejecting claim (duplicate): {text}")
            return False
        seen_hashes.add(digest)
        return True

    if lock:
        with lock:
            return _register_digest()
    # No lock supplied: single-threaded (or unsynchronized) use
    return _register_digest()
def check_entailment(claim: str, evidence: str, expected_label: str) -> bool:
    """
    Drift check: ask the LLM whether the evidence supports or contradicts
    the claim and compare the verdict with the expected label.

    Args:
        claim: Claim text to verify.
        evidence: Evidence paragraph the claim is judged against.
        expected_label: "true" (supported) or "false" (not supported); the
            verdict is mapped to one of these two strings before comparing.

    Returns:
        True if the LLM verdict matches expected_label. False on mismatch,
        and also when the LLM returned nothing, because an empty reply means
        the claim could not be verified at all.
    """
    prompt = (
        "Verify the relationship between the CLAIM and the EVIDENCE.\n"
        f"EVIDENCE: {evidence}\n"
        f"CLAIM: {claim}\n\n"
        "Does the evidence SUPPORT or CONTRADICT the claim?\n"
        "Return ONLY one word: SUPPORTED or CONTRADICTED."
    )
    result = llm_generate(prompt, json_mode=False).upper().strip()
    if not result:
        # llm_generate returns "" when generation fails. Treating that as
        # "contradicted" would silently approve every expected-"false" claim.
        logging.warning(
            f"Drift check could not be verified (empty LLM response).\nClaim: {claim}"
        )
        return False
    # Map result to label. Match SUPPORT as a whole word so "UNSUPPORTED"
    # and "NOT SUPPORTED" are not read as support, and let an explicit
    # contradiction win over any mention of support.
    supports = (
        re.search(r"(?<!NOT )\bSUPPORT(?:S|ED|ING)?\b", result) is not None
    )
    contradicts = "CONTRADICT" in result
    predicted = "true" if supports and not contradicts else "false"
    matches = predicted == expected_label
    if not matches:
        logging.warning(
            f"Drift check failed! Expected {expected_label}, got {predicted} ({result}).\nClaim: {claim}"
        )
    return matches
def llm_paraphrase_evidence(evidence: str) -> str:
    """
    Ask the LLM for a plain-English rewrite of the evidence paragraph.

    Returns the whitespace-normalized rewrite, or the original evidence
    unchanged when the LLM yields an empty result.
    """
    rewritten = llm_generate(
        "Rewrite the following evidence paragraph in natural, plain English. "
        "Keep the same meaning and all key facts. Do not add any new information.\n\n"
        f"{evidence}"
    )
    if not rewritten:
        return evidence
    return _norm_ws(rewritten)
def llm_paraphrase_claim(
    claim: str,
    evidence: str,
    expected_label: str,
    seen_hashes: set,
    lock: threading.Lock,
) -> str:
    """
    Rephrase a claim with the LLM, accepting the result only if it passes
    the quality gate (validate_claim) and the drift check (check_entailment).

    Makes at most two attempts. Returns the first accepted paraphrase, or
    the original claim if none is accepted.
    """
    instructions = (
        "Rephrase this claim to be a concise financial news headline or statement (under 40 words). "
        "Keep the meaning EXACTLY the same. Do not add facts.\n\n"
    )
    example = (
        "Examples:\n"
        "Original: The company decided to reduce its workforce by 10% to cut costs.\n"
        "Rephrased: Firm announces 10% staff reduction to improve efficiency.\n\n"
    )
    prompt = instructions + example + f"Original: {claim}"

    attempts_left = 2
    while attempts_left > 0:
        attempts_left -= 1
        # Plain-text mode: the reply itself is the candidate claim
        candidate = llm_generate(prompt, json_mode=False)
        # Checks short-circuit in order: non-empty, quality gate (the lock
        # makes the dedup step atomic), then the drift check.
        accepted = (
            bool(candidate)
            and validate_claim(
                candidate, evidence, seen_hashes, lock, check_similarity=True
            )
            and check_entailment(candidate, evidence, expected_label)
        )
        if accepted:
            return candidate

    logging.warning("Paraphrase failed quality/drift check. Keeping original.")
    return claim
def make_hard_contradiction(
    evidence: str,
    base_concept: str,
    rng: random.Random,
    seen_hashes: set,
    lock: threading.Lock,
) -> str:
    """
    Produce a claim that directly contradicts the evidence.

    Makes up to three LLM attempts and returns the first claim accepted by
    validate_claim (similarity check disabled). If none is accepted, a
    templated fallback sentence built from base_concept is returned; the
    fallback does not go through validate_claim. `rng` is accepted but not
    used by this function.
    """
    task = (
        "Based on this evidence, generate a single short sentence claim (under 40 words) that DIRECTLY CONTRADICTS the evidence. "
        "The claim must be false given the evidence.\n\n"
    )
    example = (
        "Examples:\n"
        "Evidence: Inflation reduces purchasing power.\n"
        "Claim: Inflation increases the value of money over time.\n\n"
    )
    context = f"Evidence: {evidence}\n" f"Topic: {base_concept}\n"
    prompt = task + example + context

    for _attempt in range(3):
        candidate = llm_generate(prompt, json_mode=False)
        if not validate_claim(
            candidate, evidence, seen_hashes, lock, check_similarity=False
        ):
            continue
        return candidate

    return f"The opposite of the evidence is true regarding {base_concept}."
def make_hard_unsupported(
    evidence: str,
    base_concept: str,
    rng: random.Random,
    seen_hashes: set,
    lock: threading.Lock,
) -> str:
    """
    Produce a plausible-sounding claim that the evidence does not support.

    One of several "unsupported" styles (absolutes, invented numbers, hidden
    conditions, unmentioned entities, causal leaps) is drawn with `rng` and
    given to the LLM. Makes up to three attempts and returns the first claim
    accepted by validate_claim; otherwise returns a templated fallback built
    from base_concept, which does not go through validate_claim.
    """
    styles = [
        "absolute assertion (e.g., 'always', 'guarantees', 'never')",
        "specific fake number (e.g., 'exactly 2.3%', 'within 48 hours')",
        "hidden condition (e.g., 'even if market crashes')",
        "unmentioned entity (e.g., 'approved by SEC/Fed')",
        "causal leap (e.g., 'therefore stock must double')",
    ]
    chosen_style = rng.choice(styles)

    header = (
        "Based on this evidence, generate a plausible-sounding but UNSUPPORTED claim (under 40 words).\n"
        f"Style: {chosen_style}.\n"
    )
    example = (
        "Examples:\n"
        "Evidence: The company reported a 5% increase in revenue.\n"
        "Claim: The company is guaranteed to double its revenue next year due to secret government contracts.\n\n"
    )
    context = (
        f"Evidence: {evidence}\n"
        "The claim should use concepts from the evidence but add specific details/absolutes NOT found in the text."
    )
    prompt = header + example + context

    for _attempt in range(3):
        candidate = llm_generate(prompt, json_mode=False)
        if not validate_claim(
            candidate, evidence, seen_hashes, lock, check_similarity=True
        ):
            continue
        return candidate

    return f"This concept regarding {base_concept} is guaranteed to yield 100% returns."
def make_hard_true(
    evidence: str,
    base_concept: str,
    rng: random.Random,
    seen_hashes: set,
    lock: threading.Lock,
) -> str:
    """
    Produce a more complex TRUE claim (entailed by the evidence) to balance
    the hard set.

    A phrasing style is drawn with `rng` and given to the LLM. Makes up to
    three attempts and returns the first claim accepted by validate_claim;
    otherwise returns a templated fallback built from base_concept, which
    does not go through validate_claim.
    """
    styles = [
        "paraphrase with complex syntax",
        "infer text using synonyms",
        "summarize the key point",
    ]
    chosen_style = rng.choice(styles)

    header = (
        "Generate a single sentence claim (under 40 words) that is FULLY SUPPORTED by the evidence.\n"
        f"Style: {chosen_style}.\n"
    )
    example = (
        "Examples:\n"
        "Evidence: Bond prices fall when interest rates rise.\n"
        "Claim: An increase in interest rates typically leads to a decline in bond prices.\n\n"
    )
    prompt = header + example + f"Evidence: {evidence}\n"

    for _attempt in range(3):
        candidate = llm_generate(prompt, json_mode=False)
        if not validate_claim(
            candidate, evidence, seen_hashes, lock, check_similarity=True
        ):
            continue
        return candidate

    # No candidate passed the gate: fall back to a generic statement
    return f"It is true that {base_concept} behaves as described."
# =============================
# 2) Concept bank
# =============================
@dataclass(frozen=True)
class Concept:
    """Immutable entry of the concept bank: one topic with a supported
    statement, its evidence text, and a contradicting statement."""

    # Short identifier for the concept (e.g. "bonds_rates")
    topic: str
    # Statement that is supported by the evidence
    true_stmt: str
    # Evidence paragraph for this concept
    evidence_core: str
    # Statement that contradicts the evidence
    contradict_stmt: str
CONCEPTS: List[Concept] = [
Concept(
topic="bonds_rates",
true_stmt="Bond prices typically move inversely to interest rates.",
evidence_core="When market interest rates rise, newly issued bonds tend to offer higher yields. Existing lower-yield bonds become less attractive, so their prices generally fall; when rates fall, bond prices often rise.",
contradict_stmt="Bond prices typically rise when interest rates rise.",
),
Concept(
topic="inflation_pp",
true_stmt="Higher inflation tends to reduce the purchasing power of money over time.",
evidence_core="Inflation is a broad rise in prices. If prices increase while income does not rise proportionally, the same amount of money buys fewer goods and services, reducing purchasing power.",
contradict_stmt="Higher inflation tends to increase the purchasing power of money over time.",
),
Concept(
topic="real_vs_nominal",
true_stmt="Real returns adjust nominal returns for inflation.",
evidence_core="A nominal return is the stated percentage gain without accounting for inflation. A real return reflects purchasing power by adjusting nominal returns for inflation, often approximated as nominal return minus inflation.",
contradict_stmt="Real returns ignore inflation, while nominal returns account for inflation.",
),
Concept(
topic="compounding",
true_stmt="Compounding allows interest to be earned on previously earned interest.",
evidence_core="With compounding, interest is added to the principal, so future interest is calculated on a larger base. Over time, this can cause balances to grow faster than with simple interest.",
contradict_stmt="Compounding prevents interest from being earned on previously earned interest.",
),
Concept(
topic="present_value",
true_stmt="Present value discounts future cash flows to reflect the time value of money.",
evidence_core="Money today can be worth more than the same amount in the future because it can earn returns. Present value converts future cash flows into an equivalent value today using a discount rate.",
contradict_stmt="Present value increases future cash flows so they are larger than today.",
),
Concept(
topic="npv",
true_stmt="A positive net present value (NPV) indicates expected value creation under the chosen assumptions.",
evidence_core="NPV compares the present value of expected future cash inflows to the present value of costs. If discounted inflows exceed costs, the project is expected to add value given the assumptions and discount rate.",
contradict_stmt="A positive NPV indicates expected value destruction under the chosen assumptions.",
),
Concept(
topic="wacc",
true_stmt="Weighted average cost of capital (WACC) reflects a firm's average cost of financing from debt and equity.",
evidence_core="WACC combines the cost of equity and the after-tax cost of debt in proportion to how much the firm uses each source. It is often used as a discount rate for cash flows that reflect the overall firm.",
contradict_stmt="WACC reflects only a firm's dividend payout to shareholders.",
),
Concept(
topic="diversification",
true_stmt="Diversification can reduce portfolio risk by spreading exposure across different assets.",
evidence_core="If assets do not move perfectly together, losses in one position may be offset by gains or smaller losses in another. This can reduce overall portfolio volatility relative to a concentrated position.",
contradict_stmt="Diversification always increases portfolio risk by spreading exposure.",
),
Concept(
topic="idiosyncratic_vs_systematic",
true_stmt="Diversification tends to reduce idiosyncratic risk more than systematic market risk.",
evidence_core="Idiosyncratic risk is specific to a company or asset and can be reduced by holding many different assets. Systematic risk affects broad markets and cannot be eliminated simply by diversifying within the same market.",
contradict_stmt="Diversification eliminates all market-wide risk.",
),
Concept(
topic="risk_return",
true_stmt="Higher expected returns are often associated with higher risk, but higher risk does not guarantee higher realized returns.",
evidence_core="Riskier assets may offer higher expected compensation to investors. However, outcomes vary and taking more risk does not ensure higher realized returns in any particular period.",
contradict_stmt="Higher expected returns are always associated with lower risk.",
),
Concept(
topic="beta",
true_stmt="Beta is commonly used to describe how sensitive an asset is to broad market movements.",
evidence_core="Beta is often interpreted as how much an asset's returns tend to move relative to a market benchmark. A beta above 1 suggests greater sensitivity than the market, while below 1 suggests less.",
contradict_stmt="Beta measures a company's revenue in dollars.",
),
Concept(
topic="volatility",
true_stmt="Volatility measures how much an asset's price or returns fluctuate over time.",
evidence_core="Volatility is commonly measured using the standard deviation of returns. Higher volatility means larger swings in price and is often interpreted as greater uncertainty.",
contradict_stmt="Volatility means an asset's price never changes.",
),
Concept(
topic="bid_ask",
true_stmt="The bid-ask spread is the difference between the best available bid and the best available ask.",
evidence_core="In many markets, the bid is the highest price a buyer is willing to pay and the ask is the lowest price a seller is willing to accept. The spread reflects liquidity, trading costs, and market-making compensation.",
contradict_stmt="The bid-ask spread is the total number of shares outstanding.",
),
Concept(
topic="market_order",
true_stmt="A market order prioritizes immediate execution at the best available price.",
evidence_core="Market orders emphasize execution speed over price certainty. The execution price depends on the order book and available liquidity at the moment the order is filled.",
contradict_stmt="A market order guarantees execution at a pre-set specific price.",
),
Concept(
topic="limit_order",
true_stmt="A limit order sets a maximum buy price or a minimum sell price.",
evidence_core="Limit orders provide price control but may not execute if the market does not reach the limit. A buy limit executes at the limit price or lower, and a sell limit executes at the limit price or higher.",
contradict_stmt="A limit order always executes immediately regardless of the market price.",
),
Concept(
topic="liquidity",
true_stmt="Liquidity reflects how easily an asset can be traded without materially affecting its price.",
evidence_core="Highly liquid markets tend to have many participants and narrow bid-ask spreads, allowing trades with relatively small price impact. Illiquid assets may require accepting a worse price to trade quickly.",
contradict_stmt="Liquidity means it is difficult to trade an asset quickly.",
),
Concept(
topic="market_cap",
true_stmt="Market capitalization is computed as share price times shares outstanding.",
evidence_core="Market cap is a market-based measure of a company's equity value. Because share prices move over time, market cap can also change even if shares outstanding remain constant.",
contradict_stmt="Market capitalization is computed as revenue divided by expenses.",
),
Concept(
topic="enterprise_value",
true_stmt="Enterprise value is a firm value measure that commonly adjusts equity value for debt and cash.",
evidence_core="Enterprise value is often used to approximate the value of the operating business by combining equity value with net debt. Specific formulas vary, but the concept is to reflect total value beyond just equity.",
contradict_stmt="Enterprise value is identical to a company's revenue.",
),
Concept(
topic="pe_ratio",
true_stmt="A price-to-earnings ratio compares a share price to earnings per share.",
evidence_core="The P/E ratio is commonly computed as price per share divided by earnings per share. It is widely used as a valuation multiple, but interpretation depends on growth, risk, and accounting choices.",
contradict_stmt="A P/E ratio compares a company's debt to its cash flow.",
),
Concept(
topic="eps",
true_stmt="Earnings per share (EPS) is net income divided by the weighted average number of shares.",
evidence_core="EPS expresses profit on a per-share basis. Because share counts can change due to issuance or buybacks, EPS often uses a weighted average share count over the reporting period.",
contradict_stmt="EPS is total assets divided by total liabilities.",
),
Concept(
topic="dividends",
true_stmt="Dividends are discretionary distributions a company may pay to shareholders.",
evidence_core="Companies may choose to pay dividends as a way to return value to shareholders, often in cash or additional shares. Dividend policies vary and many companies choose not to pay dividends.",
contradict_stmt="Dividends are mandatory fines that shareholders must pay to the company.",
),
Concept(
topic="buybacks",
true_stmt="Share buybacks reduce shares outstanding and can raise EPS if earnings stay constant.",
evidence_core="When a company repurchases its shares, the share count decreases. If net income is unchanged, dividing by fewer shares results in a higher EPS, though total value depends on buyback price and other factors.",
contradict_stmt="Share buybacks increase shares outstanding.",
),
Concept(
topic="stock_split",
true_stmt="A stock split increases the number of shares while reducing the price per share proportionally.",
evidence_core="In a stock split, each existing share is divided into multiple shares. The price per share adjusts so that the total equity value remains the same immediately after the split, absent market reactions.",
contradict_stmt="A stock split directly creates new company profits.",
),
Concept(
topic="dilution",
true_stmt="Issuing new shares can dilute existing shareholders' ownership percentage.",
evidence_core="When additional shares are issued, the total number of shares increases. If an existing shareholder does not buy additional shares, their ownership percentage can decline.",
contradict_stmt="Issuing new shares always increases each shareholder's ownership percentage.",
),
Concept(
topic="current_ratio",
true_stmt="The current ratio equals current assets divided by current liabilities.",
evidence_core="The current ratio is a liquidity metric comparing resources expected to be converted to cash within a year against obligations due within a year. Interpretation depends on industry and asset quality.",
contradict_stmt="The current ratio equals total revenue divided by total expenses.",
),
Concept(
topic="working_capital",
true_stmt="Working capital is commonly defined as current assets minus current liabilities.",
evidence_core="Working capital reflects short-term financial flexibility. Positive working capital can indicate capacity to cover near-term obligations, while negative working capital may indicate tighter liquidity depending on the business model.",
contradict_stmt="Working capital is total liabilities minus total assets.",
),
Concept(
topic="accrual_vs_cash",
true_stmt="Accrual accounting recognizes revenue when earned rather than when cash is received.",
evidence_core="Under accrual accounting, revenue and expenses are recorded when they are earned or incurred. This can cause accounting profit to differ from cash flow due to timing differences in collections and payments.",
contradict_stmt="Accrual accounting records revenue only when cash is received.",
),
Concept(
topic="profit_vs_cashflow",
true_stmt="A company can report accounting profits while having negative operating cash flow in a period.",
evidence_core="Accrual accounting recognizes revenue and expenses when earned or incurred, not necessarily when cash changes hands. Working capital changes can make operating cash flow diverge from net income.",
contradict_stmt="If a company reports profit, operating cash flow must be positive in the same period.",
),
Concept(
topic="depreciation",
true_stmt="Depreciation allocates the cost of a long-lived asset over its useful life and is typically a non-cash expense.",
evidence_core="Depreciation reduces reported accounting profit but does not represent an immediate cash outflow in the period it is recorded. Cash impact typically occurred when the asset was purchased.",
contradict_stmt="Depreciation is a cash payment made to suppliers every month.",
),
Concept(
topic="amortization",
true_stmt="Amortization spreads the cost of certain intangible assets over time.",
evidence_core="Amortization is an accounting process that allocates the cost of an intangible asset across its useful life. Like depreciation, it is generally a non-cash expense in the period recognized.",
contradict_stmt="Amortization is the interest rate on a credit card.",
),
Concept(
topic="ebitda",
true_stmt="EBITDA is earnings before interest, taxes, depreciation, and amortization.",
evidence_core="EBITDA is a non-GAAP metric often used to approximate operating performance by excluding financing costs, taxes, and certain non-cash charges. It is not the same as cash flow and can omit important costs.",
contradict_stmt="EBITDA is always identical to net cash flow.",
),
Concept(
topic="leverage",
true_stmt="Financial leverage can amplify both gains and losses for equity holders.",
evidence_core="Debt financing introduces fixed obligations such as interest payments. When performance is strong, leverage can increase returns on equity, but when performance weakens, leverage can magnify losses and increase default risk.",
contradict_stmt="Financial leverage always reduces risk for equity holders.",
),
Concept(
topic="credit_risk",
true_stmt="Credit risk is the risk that a borrower will fail to make required payments.",
evidence_core="In lending and bond markets, credit risk refers to the possibility of default or missed payments, which can lead to losses for lenders or bondholders.",
contradict_stmt="Credit risk is the risk that a currency symbol changes.",
),
Concept(
topic="credit_spread",
true_stmt="A credit spread is the yield difference between a riskier bond and a safer benchmark bond.",
evidence_core="Investors may demand extra yield to compensate for default risk and other risks. The difference in yields between similar-maturity bonds with different credit quality is commonly called a credit spread.",
contradict_stmt="A credit spread is the same thing as a company's dividend policy.",
),
Concept(
topic="ytm",
true_stmt="Yield to maturity estimates the annualized return of a bond if held to maturity under standard assumptions.",
evidence_core="Yield to maturity is the rate that discounts a bond's expected cash flows, such as coupons and principal repayment, to its current market price. It typically assumes payments occur as scheduled.",
contradict_stmt="Yield to maturity is the same as a bond's face value.",
),
Concept(
topic="duration",
true_stmt="Duration is a measure of a bond's price sensitivity to changes in yields.",
evidence_core="Duration summarizes how much a bond's price tends to change for a given change in yields. Longer duration generally implies greater price sensitivity to interest rate movements.",
contradict_stmt="Duration is the bond's coupon rate.",
),
Concept(
topic="yield_curve",
true_stmt="A yield curve plots bond yields against different maturities.",
evidence_core="A yield curve shows interest rates or yields for bonds of different maturities, often government bonds. The curve's shape can change over time and is sometimes used to summarize expectations about growth, inflation, and policy.",
contradict_stmt="A yield curve plots a company's quarterly sales.",
),
Concept(
topic="fx_rate",
true_stmt="An exchange rate indicates how much of one currency is needed to buy one unit of another currency.",
evidence_core="Exchange rates enable conversion between currencies. They can move due to many factors such as interest rate differences, inflation expectations, risk sentiment, and capital flows.",
contradict_stmt="An exchange rate is the fee charged on a credit card payment.",
),
Concept(
topic="call_option",
true_stmt="A call option gives the holder the right, but not the obligation, to buy an asset at a specified price.",
evidence_core="Options are derivatives. A call option allows the buyer to purchase the underlying at the strike price by expiration; the holder can let it expire if exercising is unfavorable.",
contradict_stmt="A call option obligates the holder to buy the asset regardless of price.",
),
Concept(
topic="put_option",
true_stmt="A put option gives the holder the right, but not the obligation, to sell an asset at a specified price.",
evidence_core="A put option allows the buyer to sell the underlying at the strike price by expiration. The holder may choose not to exercise if it is not beneficial.",
contradict_stmt="A put option obligates the holder to buy the asset at the strike price.",
),
Concept(
topic="futures",
true_stmt="A futures contract is an agreement to buy or sell an underlying asset at a predetermined price at a future date.",
evidence_core="Futures are standardized contracts often traded on exchanges. They are used for hedging or speculation and typically involve margining that can settle gains and losses over time.",
contradict_stmt="A futures contract is a bank deposit account.",
),
Concept(
topic="hedging",
true_stmt="Hedging uses an offsetting position to reduce exposure to a particular risk.",
evidence_core="A hedge is designed to reduce potential losses from an adverse price movement. Hedging can reduce upside potential and may involve costs such as premiums or transaction fees.",
contradict_stmt="Hedging increases exposure to the same risk.",
),
Concept(
topic="short_selling",
true_stmt="Short selling involves selling borrowed shares with the intention of buying them back later.",
evidence_core="In a typical short sale, shares are borrowed and sold. The short seller profits if the price falls and they can repurchase at a lower price, but losses can grow if the price rises.",
contradict_stmt="Short selling involves buying shares and holding them for decades.",
),
Concept(
topic="etf",
true_stmt="An exchange-traded fund (ETF) holds a basket of assets and trades on an exchange like a stock.",
evidence_core="ETFs often track an index or strategy and can be bought or sold throughout the trading day. Their market price may differ from net asset value depending on liquidity and market conditions.",
contradict_stmt="An ETF is a private loan that cannot be traded.",
),
Concept(
topic="mutual_fund_nav",
true_stmt="Net asset value (NAV) reflects a fund's assets minus liabilities, typically expressed per share.",
evidence_core="NAV is calculated by valuing holdings, subtracting liabilities, and dividing by shares outstanding. Depending on the fund structure, transactions may occur at or around NAV.",
contradict_stmt="NAV is the number of employees working at the fund.",
),
Concept(
topic="central_bank_rates",
true_stmt="Central banks may adjust policy rates to influence borrowing conditions and inflation.",
evidence_core="Policy rates can affect interest rates throughout the economy. Raising rates can discourage borrowing and spending, while lowering rates can encourage them, with implications for inflation and growth.",
contradict_stmt="Central banks change policy rates only to affect corporate branding.",
),
Concept(
topic="audit_assurance",
true_stmt="External audits typically provide reasonable assurance rather than absolute certainty.",
evidence_core="Audits use sampling, testing, and professional judgment to reduce the risk of material misstatement. Because of inherent limitations, audits generally provide reasonable assurance, not a guarantee of perfect accuracy.",
contradict_stmt="An external audit guarantees that financial statements contain no errors.",
),
Concept(
topic="gaap_ifrs",
true_stmt="GAAP and IFRS are accounting standards frameworks used to prepare financial statements.",
evidence_core="Companies prepare financial statements following a set of accounting standards. GAAP and IFRS are two widely used frameworks, and specific rules can differ between them.",
contradict_stmt="GAAP and IFRS are stock market indices.",
),
Concept(
topic="insider_trading",
true_stmt="Insider trading generally refers to trading based on material non-public information.",
evidence_core="Many jurisdictions restrict trading on material information that is not publicly available to reduce unfair informational advantages and protect market integrity.",
contradict_stmt="Insider trading means trading only during lunch breaks.",
),
]
# =============================
# 3) Templates for extra variety
# =============================
# Lead-ins that make_true_claim() picks from at random. The empty string
# means "use the concept's true statement unchanged".
TRUE_PREFIXES = [
    "",
    "In general, ",
    "Typically, ",
    "In many financial markets, ",
    "As a rule of thumb, ",
    "A common finance principle is that ",
]
# Lead-ins shared by both false-claim builders (contradiction and
# unsupported). The empty string again means "no prefix".
FALSE_PREFIXES = [
    "",
    "It is correct that ",
    "It is a fact that ",
    "In all cases, ",
]
# Endings appended to a true statement by make_false_unsupported_claim().
# Each one adds an absolute or extra assertion and carries its own final
# period, so the caller must strip the base sentence's period first.
UNSUPPORTED_TAILS = [
    " in every situation.",
    " with no exceptions.",
    ", regardless of market conditions.",
    ", and this guarantees profits for investors.",
    ", as officially confirmed by regulators in every country.",
]
# Generic sentences that make_evidence() samples (without replacement) and
# appends after a concept's core evidence text.
EVIDENCE_FILLERS = [
    "This is a general concept often taught in introductory finance.",
    "The exact magnitude of the effect can vary with market structure and assumptions.",
    "Real-world outcomes can differ depending on timing, liquidity, and investor behavior.",
    "Definitions may differ slightly across jurisdictions and reporting standards.",
    "These relationships are commonly discussed in basic financial analysis.",
    "In practice, multiple factors can influence the observed result.",
]
def _norm_ws(s: str) -> str:
return re.sub(r"\s+", " ", s).strip()
def _strip_period(s: str) -> str:
    """Normalize whitespace in *s*, then drop one trailing period if present."""
    cleaned = _norm_ws(s)
    if cleaned.endswith("."):
        # Remove only the final character; any earlier periods are kept.
        return cleaned[:-1]
    return cleaned
def _lc_first(s: str) -> str:
    """Lower-case the first character of *s* so it can follow a prefix.

    Whitespace is normalized first. The text is returned unchanged when it is
    empty, or when its first word contains an uppercase letter after the first
    position (an acronym or similar, e.g. "GAAP", "NAV", "ETFs"): lower-casing
    only the first letter of such a word would produce "gAAP" / "nAV".
    """
    s = _norm_ws(s)
    if not s:
        return s
    # After normalization words are separated by single spaces.
    first_word = s.split(" ", 1)[0]
    if any(ch.isupper() for ch in first_word[1:]):
        return s
    return s[0].lower() + s[1:]
def make_true_claim(c: Concept, rng: random.Random) -> str:
    """Return the concept's true statement, optionally behind a random prefix.

    One entry of TRUE_PREFIXES is drawn from *rng*. With the empty prefix the
    statement is used as written; with any other prefix the statement's first
    letter is lower-cased so it reads as a continuation. The result always
    ends with exactly one period.
    """
    statement = _strip_period(c.true_stmt)
    lead_in = rng.choice(TRUE_PREFIXES)
    if lead_in:
        sentence = lead_in + _lc_first(statement)
    else:
        sentence = statement
    return _norm_ws(sentence) + "."
def make_false_contradict_claim(c: Concept, rng: random.Random) -> str:
    """Return the concept's contradicting statement behind a random prefix.

    One entry of FALSE_PREFIXES is drawn from *rng*; a non-empty prefix causes
    the statement's first letter to be lower-cased. The result ends with a
    single period.
    """
    statement = _strip_period(c.contradict_stmt)
    lead_in = rng.choice(FALSE_PREFIXES)
    if lead_in:
        statement = _lc_first(statement)
    return _norm_ws(lead_in + statement) + "."
def make_false_unsupported_claim(c: Concept, rng: random.Random) -> str:
    """Return the true statement with an unsupported absolute tacked on.

    "Unsupported" means plausible but not confirmed by the evidence: the true
    statement is kept and one of UNSUPPORTED_TAILS (an absolute or an extra
    assertion) is appended. The prefix is drawn from *rng* first, then the
    tail.
    """
    statement = _strip_period(c.true_stmt)
    lead_in = rng.choice(FALSE_PREFIXES)
    ending = rng.choice(UNSUPPORTED_TAILS)
    if lead_in:
        statement = _lc_first(statement)
    # Strip any period again before attaching the tail: every tail already
    # ends with its own period, so this keeps the claim a single sentence.
    body = _strip_period(_norm_ws(lead_in + statement))
    return _norm_ws(body + ending)
def make_evidence(
    c: Concept, rng: random.Random, min_fillers: int, max_fillers: int
) -> str:
    """Return the concept's core evidence followed by random filler sentences.

    The number of fillers is drawn uniformly from [min_fillers, max_fillers]
    and capped at the number of available EVIDENCE_FILLERS; fillers are
    sampled without replacement. A draw of zero or less adds no filler.
    """
    pieces = [_norm_ws(c.evidence_core)]
    count = rng.randint(min_fillers, max_fillers)
    if count > 0:
        pieces.extend(
            rng.sample(EVIDENCE_FILLERS, k=min(count, len(EVIDENCE_FILLERS)))
        )
    return _norm_ws(" ".join(pieces))
# =============================
# 4) Dataset builder
# =============================
def build_rows(seed: int, checkpoint_path: str = None) -> List[dict]:
    """
    Build the synthetic claim/evidence rows (target: 1000 samples).

    Target composition:
    - 600 Controlled, rule-based (300 True, 300 False)
      - up to 350 of them are submitted for LLM paraphrasing
    - 400 Hard Set (LLM-generated)
      - 200 Hard True
      - 100 Hard Contradiction
      - 100 Hard Unsupported
    Target total: 500 True, 500 False.

    These are targets, not guarantees: a controlled sample whose generation
    raises is skipped, and a hard sample that still fails after 5 attempts is
    dropped (a warning is logged when that happens).

    Args:
        seed: Seed for the main RNG and, offset by task index, for the
            per-task RNGs of the hard-set workers.
        checkpoint_path: If truthy, the rows built so far are written to this
            CSV path after every 50 completed parallel tasks.

    Returns:
        A shuffled list of dicts with keys "claim", "evidence", "label",
        "type".
    """
    if checkpoint_path:
        logging.info(
            f"Checkpointing enabled. Saving to {checkpoint_path} every 50 samples."
        )
    rng = random.Random(seed)
    # MD5 digests of claims generated so far; shared with the LLM helpers.
    seen_hashes = set()
    # Fixed parameters for evidence generation
    min_ev_fillers = 2
    max_ev_fillers = 5
    # Cycle concepts in a shuffled order to avoid over-using only a few
    order = list(range(len(CONCEPTS)))
    rng.shuffle(order)
    rows: List[dict] = []

    # =============================
    # A. Generate 600 Controlled Samples
    # =============================
    logging.info("=" * 70)
    logging.info("Phase 1: Generating 600 controlled samples")
    logging.info(" - 300 True / 300 False")
    logging.info("=" * 70)
    print("Generating 600 controlled samples...")
    controlled_samples = []
    # 1. Generate base samples (fast, rule-based, sequential)
    for i in range(600):
        try:
            c = CONCEPTS[order[i % len(order)]]
            is_true = i < 300
            evidence = make_evidence(c, rng, min_ev_fillers, max_ev_fillers)
            if is_true:
                claim = make_true_claim(c, rng)
                label = "true"
            else:
                # False samples mix contradictions (~55%) and unsupported claims
                if rng.random() < 0.55:
                    claim = make_false_contradict_claim(c, rng)
                else:
                    claim = make_false_unsupported_claim(c, rng)
                label = "false"
            # No lock needed here: the worker threads have not started yet.
            h = hashlib.md5(claim.encode("utf-8")).hexdigest()
            seen_hashes.add(h)
            controlled_samples.append(
                {
                    "claim": claim,
                    "evidence": evidence,
                    "label": label,
                    "type": "controlled",
                }
            )
        except Exception as e:
            logging.error(f"Error generating controlled sample {i}: {e}")

    # 2. Parallel paraphrase
    # One worker per client so each API key's rate limit is respected.
    max_w = max(len(clients), 1)
    # Clamp so rng.sample() cannot raise if some controlled samples failed.
    n_paraphrase = min(350, len(controlled_samples))
    logging.info(
        f"Paraphrasing {n_paraphrase} samples in parallel using {max_w} workers..."
    )
    print(f"Paraphrasing {n_paraphrase} samples in parallel using {max_w} workers...")
    paraphrase_indices = sorted(
        set(rng.sample(range(len(controlled_samples)), n_paraphrase))
    )
    hash_lock = threading.Lock()

    def process_paraphrase(idx, sample):
        """Return (idx, new_claim, new_type), or (idx, None, None) if unchanged/failed."""
        try:
            org_claim = sample["claim"]
            # Validation/dedup is expected to happen inside llm_paraphrase_claim,
            # which receives the shared hash set and its lock (helper not
            # visible here -- confirm against its definition).
            new_claim = llm_paraphrase_claim(
                org_claim,
                sample["evidence"],
                sample["label"],
                seen_hashes,
                hash_lock,
            )
            if new_claim != org_claim:
                return idx, new_claim, "controlled_paraphrased"
        except Exception as e:
            logging.error(f"Paraphrase error sample {idx}: {e}")
        return idx, None, None

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_w) as executor:
        futures = {
            executor.submit(process_paraphrase, idx, controlled_samples[idx]): idx
            for idx in paraphrase_indices
        }
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            idx, new_claim, new_type = future.result()
            if new_claim:
                # Results are written back by index, so completion order does
                # not affect the content of controlled_samples.
                controlled_samples[idx]["claim"] = new_claim
                controlled_samples[idx]["type"] = new_type
            # Count every finished task, successful or not, as progress.
            completed += 1
            if completed % 50 == 0:
                print(f" Paraphrased {completed}/{n_paraphrase} samples")
                logging.info(f" ✓ Paraphrased {completed}/{n_paraphrase} samples")
                if checkpoint_path:
                    write_csv(checkpoint_path, rows + controlled_samples)
    rows.extend(controlled_samples)
    logging.info(f"✓ Phase 1 complete: {len(rows)} samples")

    # =============================
    # B. Generate 400 Hard Set Samples (Parallel)
    # =============================
    logging.info("\nPhase 2: Generating 400 hard set samples (Parallel)")
    print(f"\nGenerating 400 hard set samples (Parallel {max_w} workers)...")
    # We need: 200 Hard True, 100 Hard Contradiction, 100 Hard Unsupported
    # Each task is (function, type_label, label_val)
    tasks = []
    tasks.extend([(make_hard_true, "hard_true", "true")] * 200)
    tasks.extend([(make_hard_contradiction, "hard_contradiction", "false")] * 100)
    tasks.extend([(make_hard_unsupported, "hard_unsupported", "false")] * 100)
    # Shuffle tasks so we don't hammer one type first
    rng.shuffle(tasks)
    # Index the tasks so each worker can derive its own deterministic seed
    indexed_tasks = list(enumerate(tasks))  # (index, (func, type, label))

    def process_hard_sample(item):
        """Build one hard sample, retrying up to 5 times; return None on failure."""
        idx, (func, type_label, label_val) = item
        # Local RNG per task: random.Random instances are not shared across
        # threads, and the draw sequence depends only on seed and task index.
        local_rng = random.Random(seed + idx + 10000)
        for _ in range(5):
            try:
                c = local_rng.choice(CONCEPTS)
                evidence = make_evidence(c, local_rng, min_ev_fillers, max_ev_fillers)
                evidence = llm_paraphrase_evidence(evidence)
                # The generator receives the shared hash set and lock; hash
                # bookkeeping is expected to happen inside it.
                claim = func(evidence, c.topic, local_rng, seen_hashes, hash_lock)
                return {
                    "claim": claim,
                    "evidence": evidence,
                    "label": label_val,
                    "type": type_label,
                }
            except Exception as e:
                logging.error(f"Hard sample gen error: {e}")
                # Brief pause before the next attempt
                time.sleep(1)
        return None

    # Keyed by task index so the final order does not depend on which thread
    # finishes first; otherwise the seeded shuffle below is not reproducible.
    hard_by_index = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_w) as executor:
        futures = {
            executor.submit(process_hard_sample, item): item[0]
            for item in indexed_tasks
        }
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            res = future.result()
            if res:
                hard_by_index[futures[future]] = res
            completed += 1
            if completed % 50 == 0:
                print(f" Generated {completed}/{len(tasks)} hard samples")
                logging.info(f" ✓ Generated {completed}/{len(tasks)} hard samples")
                if checkpoint_path:
                    write_csv(
                        checkpoint_path,
                        rows + [hard_by_index[i] for i in sorted(hard_by_index)],
                    )
    generated_hard_samples = [hard_by_index[i] for i in sorted(hard_by_index)]
    if len(generated_hard_samples) < len(tasks):
        logging.warning(
            f"Only {len(generated_hard_samples)}/{len(tasks)} hard samples were "
            "generated; the dataset will be smaller than planned."
        )
    rows.extend(generated_hard_samples)

    # Shuffle
    logging.info("\nShuffling all samples...")
    rng.shuffle(rows)
    logging.info(f"✓ All phases complete: Total {len(rows)} samples")
    logging.info("=" * 70)
    return rows
def write_csv(path: str, rows: List[dict]) -> None:
    """Write *rows* to *path* as UTF-8 CSV with a header row.

    Columns are claim, evidence, label, type. The parent directory is created
    if it does not exist, and an existing file is overwritten.
    """
    import os

    # A bare filename has an empty dirname; fall back to the current directory.
    parent_dir = os.path.dirname(path) or "."
    os.makedirs(parent_dir, exist_ok=True)
    columns = ["claim", "evidence", "label", "type"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)
def main() -> None:
    """Command-line entry point: build the dataset, write it, report statistics.

    Parses --seed, --out and --log, configures logging (INFO and above to the
    log file, WARNING and above to the console), calls build_rows() with the
    output path doubling as the checkpoint path, writes the final CSV, and
    emits the same summary to stdout and to the log.
    """
    ap = argparse.ArgumentParser(
        description="Generate 1000 high-quality test samples: 600 Controlled + 400 Hard Set"
    )
    ap.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility"
    )
    ap.add_argument(
        "--out",
        type=str,
        default="./synthetic_finance_1000.csv",
        help="Output CSV file path",
    )
    ap.add_argument(
        "--log",
        type=str,
        default="./gen_data.log",
        help="Log file path for detailed logging",
    )
    args = ap.parse_args()

    # Configure logging
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    # File handler for detailed logging (INFO and above); truncates on each run
    file_handler = logging.FileHandler(args.log, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(log_format))
    # Console handler for errors and warnings only
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)
    console_handler.setFormatter(logging.Formatter(log_format))
    # Configure root logger
    logging.basicConfig(level=logging.INFO, handlers=[file_handler, console_handler])

    banner = "=" * 70
    print(banner)
    print("High-Quality Test Set Generation")
    print(banner)
    print("Structure: 600 Controlled + 400 Hard Set (Balanced)")
    print(f"Seed: {args.seed}")
    print(f"Log file: {args.log}")
    print(banner)
    print()
    logging.info(banner)
    logging.info("Starting data generation process")
    logging.info(f"Seed: {args.seed}")
    logging.info(f"Output file: {args.out}")
    logging.info(f"Log file: {args.log}")
    logging.info(banner)

    rows = build_rows(seed=args.seed, checkpoint_path=args.out)
    write_csv(args.out, rows)

    # Calculate statistics
    total = len(rows)
    true_n = sum(1 for r in rows if r["label"] == "true")
    false_n = total - true_n

    def pct(n: int) -> float:
        # Guard against an empty result set so reporting never divides by zero.
        return n / total * 100 if total else 0.0

    def count_type(type_label: str) -> int:
        return sum(1 for r in rows if r.get("type") == type_label)

    controlled_pure = count_type("controlled")
    controlled_para = count_type("controlled_paraphrased")
    hard_true = count_type("hard_true")
    hard_contradiction = count_type("hard_contradiction")
    hard_unsupported = count_type("hard_unsupported")

    # Lines that are identical on stdout and in the log file.
    stats_lines = [
        "",
        "Label distribution:",
        f" - True: {true_n} ({pct(true_n):.1f}%)",
        f" - False: {false_n} ({pct(false_n):.1f}%)",
        "",
        "Sample type distribution:",
        f" - Controlled (pure rule-based): {controlled_pure}",
        f" - Controlled (with LLM paraphrase): {controlled_para}",
        f" - Hard True (LLM): {hard_true}",
        f" - Hard Contradiction (LLM): {hard_contradiction}",
        f" - Hard Unsupported (LLM): {hard_unsupported}",
        "",
        f"Concepts used: {len(CONCEPTS)}",
        "Columns: claim, evidence, label, type",
        banner,
    ]

    print()
    print(banner)
    print("✓ Test set generation complete")
    print(banner)
    print(f"Output: {args.out}")
    print(f"Total samples: {total}")
    for line in stats_lines:
        print(line)

    # Log final statistics
    logging.info("")
    logging.info(banner)
    logging.info("✓ Test set generation complete")
    logging.info(banner)
    logging.info(f"Output file: {args.out}")
    logging.info(f"Total samples generated: {total}")
    for line in stats_lines:
        logging.info(line)
    logging.info("Data generation process completed successfully!")
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|