# File size: 3,351 Bytes
# a19c885
import json
import re
import random
# --- BATCH 12: DEV SET UPDATE ---
# Dev-set sentences with inline "{entity text|LABEL}" annotations.
# They are kept distinct from Training Batch 11 while testing the same concepts.
batch_12_raw = [
    # INDICATOR mentions placed in new contexts
    "The {Empire State manufacturing survey|INDICATOR} plummeted to -11.3 in January.",
    "Bond yields fell after the weak {ADP payrolls|INDICATOR} data was released.",
    "The {Core Personal Consumption Expenditures|INDICATOR} price index rose 0.3%.",
    "Investors ignored the better-than-expected {industrial production|INDICATOR} print.",
    "{Existing home sales|INDICATOR} dropped 1.9% as mortgage rates stayed high.",
    "The {Conference Board Consumer Confidence Index|INDICATOR} slid to 106.7.",
    "A surprise jump in {initial jobless claims|INDICATOR} suggests the labor market is softening.",

    # EVENT mentions phrased with new synonyms (several also carry ORG spans)
    "{Chevron|ORG} announced a $75 billion {share repurchase|EVENT} program.",
    "The {trading halt|EVENT} on {New York Community Bancorp|ORG} lasted for an hour.",
    "{Trian Partners|ORG} has built a significant {stake|EVENT} in {Allstate|ORG}.",
    "The market is entering a technical {correction|EVENT} after the 10% drop.",
    "{Reddit|ORG}'s {direct listing|EVENT} is expected to value the firm at $5 billion.",
    "A {short squeeze|EVENT} in {Root Inc.|ORG} sent the stock soaring 30%.",
    # NOTE(review): Japan is tagged ORG here; confirm that matches the labeling scheme.
    "{Japan|ORG} intervened to stop the {sell-off|EVENT} in the Yen.",
    "The {merger|EVENT} talks between {Warner Bros|ORG} and {Paramount|ORG} have stalled.",
]
# --- CONVERSION LOGIC ---
# Inline annotation syntax: "{entity text|LABEL}" where LABEL is upper-case A-Z.
_ENTITY_PATTERN = re.compile(r"\{(.*?)\|([A-Z]+)\}")


def _parse_annotated_line(line):
    """Strip inline ``{text|LABEL}`` markup from *line*.

    Returns ``(clean_text, entities)`` where ``entities`` is a list of
    ``(start, end, label)`` tuples holding character offsets into
    ``clean_text``.
    """
    parts = []
    entities = []
    offset = 0  # length of the clean text assembled so far
    last_match_end = 0
    for match in _ENTITY_PATTERN.finditer(line):
        pre_text = line[last_match_end:match.start()]
        entity_text, label = match.group(1), match.group(2)
        start = offset + len(pre_text)
        end = start + len(entity_text)
        entities.append((start, end, label))
        parts.append(pre_text)
        parts.append(entity_text)
        offset = end
        last_match_end = match.end()
    # Whatever follows the last annotation is plain text.
    parts.append(line[last_match_end:])
    return "".join(parts), entities


def _example_text(example):
    """Return the text of a ``[text, annotations]`` example, else ``None``."""
    if isinstance(example, (list, tuple)) and example and isinstance(example[0], str):
        return example[0]
    return None


def update_dev_file(new_lines, dev_file="dev_financial_ner.json"):
    """Append annotated sentences to the JSON dev set stored in *dev_file*.

    Each line in *new_lines* uses inline ``{entity text|LABEL}`` markup. The
    markup is removed and every annotation becomes a ``(start, end, label)``
    character span, giving ``(text, {"entities": [...]})`` examples that are
    appended (never shuffled) to the examples already in the file.

    Lines without any annotation are ignored. A line whose clean text is
    already present in the dev set (or appeared earlier in *new_lines*) is
    skipped, so running the update twice does not duplicate examples.

    Args:
        new_lines: Iterable of annotated sentence strings.
        dev_file: Path of the JSON file to read and overwrite.

    Returns:
        None. If *dev_file* does not exist, an error is printed and nothing
        is written.

    Raises:
        TypeError: If *dev_file* does not contain a JSON list.
    """
    # 1. Load Existing Dev Data
    try:
        with open(dev_file, "r", encoding="utf-8") as f:
            existing_dev = json.load(f)
    except FileNotFoundError:
        print(f"Error: Could not find {dev_file}. Make sure you generated the dataset first.")
        return
    if not isinstance(existing_dev, list):
        # Fail before touching the file instead of at the list concatenation.
        raise TypeError(
            f"{dev_file} must contain a JSON list, got {type(existing_dev).__name__}"
        )
    print(f"Loaded {len(existing_dev)} existing Dev examples.")

    # Texts already in the dev set; used to keep re-runs idempotent.
    seen_texts = {_example_text(example) for example in existing_dev}
    seen_texts.discard(None)

    # 2. Convert New Lines to Spacy Format
    new_data = []
    skipped = 0
    for line in new_lines:
        clean_text, entities = _parse_annotated_line(line)
        if not entities:
            continue
        if clean_text in seen_texts:
            skipped += 1
            continue
        seen_texts.add(clean_text)
        new_data.append((clean_text, {"entities": entities}))

    # 3. Merge (Append only)
    # We do NOT shuffle here because we want to ensure these specific tests are included
    updated_dev = existing_dev + new_data

    # 4. Save Back to File
    with open(dev_file, "w", encoding="utf-8") as f:
        json.dump(updated_dev, f, indent=2)

    if skipped:
        print(f"Skipped {skipped} examples already present in the Dev Set.")
    print(f"Success! Added {len(new_data)} new examples to the Dev Set.")
    print(f"New Dev Set Size: {len(updated_dev)}")
# --- EXECUTE ---
# Runs on load: appends the Batch 12 examples to the default dev file.
update_dev_file(batch_12_raw)