# Environment Setup
import os

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

import json

import numpy as np
import pandas as pd
import spacy
import textstat
import torch
import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load Models Efficiently
model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B').to(device)
# Disable unnecessary components in Spacy to save time/memory
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])


def _max_tree_depth(node, depth):
    """Return the depth of the deepest leaf in the dependency subtree rooted at
    *node*, where *node* itself sits at *depth*.

    Hoisted to module level so it is defined once instead of being re-created
    as a closure for every sentence (as the original inline ``walk_tree`` was).
    """
    children = list(node.children)
    if not children:
        return depth
    return max(_max_tree_depth(child, depth + 1) for child in children)


def get_parse_tree_stats(text):
    """Mean maximum dependency-parse-tree depth across the sentences of *text*.

    Parameters
    ----------
    text : str
        Raw text; parsed with the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    float
        Mean over sentences of each sentence's deepest parse path (root = 1),
        or 0 when the text yields no sentences.
    """
    doc = nlp(text)
    depths = [_max_tree_depth(sent.root, 1) for sent in doc.sents]
    return np.mean(depths) if depths else 0


# 2. Data Loading
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split='train', streaming=True)

# 3. PRE-PROCESS WIKI ANCHORS (Do this ONCE)
# Stream the article subset and chunk on the fly: the original first
# materialized the full text of 1M articles in a list before chunking, which
# defeats the stated goal of keeping memory manageable. Only paragraphs longer
# than 20 words are kept as anchor candidates.
print("Chunking and Encoding Wikipedia...")
wiki_chunks = []
for item in ds.take(1000000):
    wiki_chunks.extend(
        p.strip() for p in item['text'].split('\n\n') if len(p.split()) > 20
    )

# Encode all chunks at once and keep on GPU
chunk_embs = model.encode(
    wiki_chunks, convert_to_tensor=True, show_progress_bar=True
).to(device)
# 4. Load Target Docs
with open("/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_v1.json", "r") as f:
    res = json.load(f)

# Flatten: one record per (document index, readability label) pair.
my_target_documents = []
for item in res:
    for key, value in item['diff_label_texts'].items():
        my_target_documents.append({"index": item['index'], "label": key, "text": value})

# Load Progress (resume support: previously saved results are kept and skipped)
save_path = "/home/mshahidul/readctrl/data/data_annotator_data/crowdsourcing_input_en_v2.json"
processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(d['index'], d['label']) for d in processed_data}

# 5. Process with Batching logic where possible
print("Starting Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    if (doc['index'], doc['label']) in processed_keys:
        continue

    # A. Robust Anchor Finding (Optimized)
    doc_emb = model.encode(doc['text'], convert_to_tensor=True).to(device)
    doc_len = len(doc['text'].split())
    hits = util.semantic_search(doc_emb, chunk_embs, top_k=25)[0]

    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')
    for hit in hits:
        cand_text = wiki_chunks[hit['corpus_id']]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)
        # Track fallback while looking for strict match
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text
        # Strict match: candidate within ±20% of the target's word count.
        # Guard doc_len > 0 — an empty target text would otherwise raise
        # ZeroDivisionError here.
        if doc_len > 0 and 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break
    if not wiki_anchor:
        wiki_anchor = best_fallback
    if wiki_anchor is None:
        # semantic_search returned no hits at all; skip instead of passing
        # None into textstat and crashing the whole run.
        continue

    # B. Calculate Metrics
    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len,
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split()),
    }

    # C. Store results
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl'],
    })
    # Record the key so a duplicate (index, label) later in the input list is
    # not processed and appended a second time (the original never updated
    # processed_keys during the run).
    processed_keys.add((doc['index'], doc['label']))

    # Save every 20 to reduce disk I/O overhead
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final Save
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)