"""
Convert JSON data to DASCO pkl format for training.

Splits the data into train/dev/test and converts each sample to the required format.
"""

import argparse
import json
import os
import pickle
import warnings

import numpy as np
import stanza
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm

warnings.filterwarnings("ignore")

POLARITY_MAP = {
    "Positive": "POS",
    "Negative": "NEG",
    "Neutral": "NEU",
    "Unknown": "NEU",
}


def run_dependency_parsing(text: str, nlp) -> dict:
    """Run Stanza dependency parsing, falling back to a flat parse on failure."""
    try:
        doc = nlp(text)
        tokens = []
        postag = []
        edges = []
        deprels = []

        for sent in doc.sentences:
            for word in sent.words:
                tokens.append(word.text)
                postag.append(word.upos)
                edges.append(word.head)
                deprels.append(word.deprel)

        return {
            "token": tokens,
            "postag": postag,
            "edges": edges,
            "deprels": deprels,
        }
    except Exception as e:
        print(f"Parsing error: {e}")

    # Fallback: whitespace tokens in a left-to-right head chain (heads are
    # 1-indexed, 0 marks the root), mirroring Stanza's conventions. Guard the
    # empty-text case so the edge list matches the token list length.
    tokens = text.split()
    n = len(tokens)
    return {
        "token": tokens,
        "postag": ["NOUN"] * n,
        "edges": ([0] + list(range(1, n))) if n > 0 else [],
        "deprels": (["root"] + ["dep"] * (n - 1)) if n > 0 else [],
    }
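
# Illustrative shape of the returned dict for the review "Great pizza" (tags and
# relations are examples only; actual values depend on the Stanza model):
#   {"token": ["Great", "pizza"], "postag": ["ADJ", "NOUN"],
#    "edges": [2, 0], "deprels": ["amod", "root"]}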


def create_scope_offsets(aspect_from: int, aspect_to: int, text_len: int, window: int = 3) -> list:
    """
    Create an aspect scope as absolute token positions [start_pos, end_pos].

    ParseData later derives offsets as aspect_scope = [id_b - s_b, s_e - id_e],
    so this returns:
      - start_pos = aspect_from - left_offset
      - end_pos   = aspect_to + right_offset
    where each offset is at most `window` and clamped to the sentence boundaries.
    """
    left_offset = min(window, aspect_from)
    right_offset = min(window, text_len - 1 - aspect_to) if text_len > 0 else 0

    start_pos = aspect_from - left_offset
    end_pos = aspect_to + right_offset
    return [start_pos, end_pos]
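
# Worked example for create_scope_offsets (window=3, assumed values): an aspect
# spanning tokens 5..6 of a 20-token review gives left_offset=3 and
# right_offset=3, so the returned scope is [2, 9].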


def convert_sample(sample: dict, image_features: dict, nlp) -> dict:
    """Convert a single JSON sample to the DASCO pkl record format."""
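    # Expected sample layout, inferred from the fields accessed below (assumed,
    # not an authoritative schema):
    #   "photo_url":                 key into the image feature cache
    #   "photo_caption":             used as the scene-graph text
    #   "review":                    the review text to parse
    #   "review_aspects":            [{"term": str, "from": int, "to": int}, ...]
    #   "review_opinion_categories": one label per aspect (Positive/Negative/Neutral/Unknown)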
    url = sample.get("photo_url", "")
    if url in image_features:
        image_feature = image_features[url]["features"]
    else:
        print(f"Warning: Image not found in cache: {url[:50]}...")
        image_feature = np.zeros((257, 1408), dtype=np.float32)

    scene_graph = sample.get("photo_caption", "")
    text = sample.get("review", "")

    parse_result = run_dependency_parsing(text, nlp)
    text_tokens = parse_result["token"]
    text_len = len(text_tokens)

    # Build the aspect list, clamping token spans to the parsed sentence length.
    aspects = []
    review_aspects = sample.get("review_aspects", [])
    opinion_categories = sample.get("review_opinion_categories", [])

    for i, aspect in enumerate(review_aspects):
        polarity = POLARITY_MAP.get(
            opinion_categories[i] if i < len(opinion_categories) else "Unknown",
            "NEU"
        )

        aspect_from = aspect["from"]
        aspect_to = aspect["to"]
        aspect_from = max(0, min(aspect_from, text_len - 1)) if text_len > 0 else 0
        aspect_to = max(0, min(aspect_to, text_len - 1)) if text_len > 0 else 0

        scope = create_scope_offsets(aspect_from, aspect_to, text_len)

        aspects.append({
            "term": aspect["term"],
            "from": aspect_from,
            "to": aspect_to,
            "polarity": polarity,
            "scope": scope
        })

    # Nouns mirror the aspect spans without polarity.
    nouns = []
    for aspect in aspects:
        nouns.append({
            "term": aspect["term"],
            "from": aspect["from"],
            "to": aspect["to"],
            "scope": aspect["scope"]
        })

    query_input = "Extract aspect terms and their sentiment polarity"
    target = [asp["term"] for asp in aspects] if aspects else [text]

    parse_info = {
        "token": parse_result["token"],
        "postag": parse_result["postag"],
        "edges": parse_result["edges"],
        "deprels": parse_result["deprels"],
        "aspects": [{
            "term": asp["term"],
            "from": asp["from"],
            "to": asp["to"],
            "polarity": asp["polarity"],
            "scope": asp["scope"]
        } for asp in aspects]
    }

    return {
        "image_feature": image_feature,
        "query_input": query_input,
        "scene_graph": scene_graph,
        "text_input": text,
        "target": target,
        "nouns": nouns,
        "parse_info": parse_info
    }


def save_as_pkl(data: list, output_dir: str, batch_size: int = 100):
    """Save data as pkl files in batches."""
    os.makedirs(output_dir, exist_ok=True)

    num_batches = (len(data) + batch_size - 1) // batch_size

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        batch_idx = i // batch_size
        output_path = os.path.join(output_dir, f"data_{batch_idx}.pkl")
        with open(output_path, 'wb') as f:
            pickle.dump(batch, f)

    print(f"Saved {len(data)} samples to {output_dir} ({num_batches} files)")
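
# A saved batch can be read back with pickle, e.g. (path assumes the default
# --output directory and the "train" split):
#   with open("./finetune_dataset/custom/train/data_0.pkl", "rb") as f:
#       batch = pickle.load(f)   # a list of up to `batch_size` sample dicts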


def main(args):
    # Load the precomputed image feature cache.
    print(f"Loading image cache from {args.image_cache}...")
    with open(args.image_cache, 'rb') as f:
        image_features = pickle.load(f)
    print(f"✓ Loaded {len(image_features)} cached images")
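    # The cache is expected to map photo_url -> {"features": np.ndarray}; samples
    # whose URL is missing fall back to a zero feature of shape (257, 1408) inside
    # convert_sample.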
    # Initialise the Stanza dependency parser.
    print("Initializing Stanza NLP pipeline...")
    use_gpu = torch.cuda.is_available()
    try:
        stanza.download('en', verbose=False)
    except Exception:
        pass
    nlp = stanza.Pipeline(
        'en',
        processors='tokenize,pos,lemma,depparse',
        verbose=False,
        use_gpu=use_gpu
    )
    print(f"✓ Stanza initialized (GPU: {use_gpu})")

    # Load the raw JSON dataset.
    print(f"Loading {args.input}...")
    with open(args.input, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"✓ Loaded {len(data)} samples")

    # Keep only samples whose image features are present in the cache.
    print("Filtering samples with cached images...")
    valid_data = [s for s in tqdm(data, desc="Filtering") if s.get("photo_url") in image_features]
    print(f"✓ Valid samples: {len(valid_data)} / {len(data)}")

    if len(valid_data) == 0:
        print("ERROR: No valid samples found! Check image cache.")
        return

    # Split into train / dev / test.
    train_data, temp_data = train_test_split(
        valid_data,
        test_size=(args.val_ratio + args.test_ratio),
        random_state=42
    )
    val_ratio_adjusted = args.val_ratio / (args.val_ratio + args.test_ratio)
    val_data, test_data = train_test_split(
        temp_data,
        test_size=(1 - val_ratio_adjusted),
        random_state=42
    )
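    # With the default ratios (0.8 / 0.1 / 0.1) this first holds out 20% of the
    # data, then splits that hold-out evenly into dev and test.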
    print(f"Split: Train={len(train_data)}, Val={len(val_data)}, Test={len(test_data)}")

    # Convert each split and write it out in pkl batches.
    for split_name, split_data in [("train", train_data), ("dev", val_data), ("test", test_data)]:
        print(f"\n{'='*50}")
        print(f"Converting {split_name} set ({len(split_data)} samples)...")
        converted = []
        errors = 0
        pbar = tqdm(split_data, desc=f"Converting {split_name}", unit="sample")
        for sample in pbar:
            try:
                result = convert_sample(sample, image_features, nlp)
                converted.append(result)
            except Exception:
                errors += 1
            pbar.set_postfix({"converted": len(converted), "errors": errors})

        output_dir = os.path.join(args.output, split_name)
        save_as_pkl(converted, output_dir, args.batch_size)
        print(f"✓ {split_name}: {len(converted)} samples saved ({errors} errors)")

    print("\n=== Conversion Complete ===")
    print(f"Output directory: {args.output}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert JSON to DASCO pkl format")
    parser.add_argument("--input", type=str, default="./data/text_image_dataset.json",
                        help="Input JSON file")
    parser.add_argument("--image_cache", type=str, default="./image_cache/features.pkl",
                        help="Image features cache file")
    parser.add_argument("--output", type=str, default="./finetune_dataset/custom",
                        help="Output directory")
    parser.add_argument("--train_ratio", type=float, default=0.8,
                        help="Kept for reference; the train split is whatever remains after val and test")
    parser.add_argument("--val_ratio", type=float, default=0.1)
    parser.add_argument("--test_ratio", type=float, default=0.1)
    parser.add_argument("--batch_size", type=int, default=100,
                        help="Samples per pkl file")
    args = parser.parse_args()
    main(args)