# CUDA_VISIBLE_DEVICES must be set before torch (or any library that
# initializes CUDA) is imported, so that only GPU 4 is visible to this process.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4"
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset

# Use the single visible GPU when available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model used for semantic retrieval of Wikipedia anchor paragraphs.
model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B').to(device)

# Only tokenization + dependency parsing are needed (for tree-depth stats);
# disabling the other pipeline components speeds up nlp() calls.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])
|
|
def get_parse_tree_stats(text):
    """Return the mean dependency-parse-tree depth over the sentences of *text*.

    Each sentence's depth is the longest root-to-leaf path in its dependency
    tree (a bare root counts as depth 1). Returns 0.0 for text that yields
    no sentences.
    """
    # Hoisted out of the sentence loop: the original re-defined this closure
    # on every iteration for no benefit.
    def _max_depth(node, depth):
        children = list(node.children)
        if not children:
            return depth
        return max(_max_depth(child, depth + 1) for child in children)

    doc = nlp(text)
    depths = [_max_depth(sent.root, 1) for sent in doc.sents]
    # float(...) so the empty and non-empty cases return the same type
    # (the original mixed np.float64 with the int literal 0).
    return float(np.mean(depths)) if depths else 0.0
|
|
| |
# Stream the English Wikipedia dump (avoids downloading the full dataset up
# front) and materialize the raw text of the first 1,000,000 articles.
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split='train', streaming=True)

wiki_list = [item['text'] for item in ds.take(1000000)]
|
|
| |
print("Chunking and Encoding Wikipedia...")
# Split every article on blank lines and keep only substantive paragraphs
# (more than 20 whitespace-delimited tokens), stripped of surrounding space.
wiki_chunks = [
    paragraph.strip()
    for article in wiki_list
    for paragraph in article.split('\n\n')
    if len(paragraph.split()) > 20
]

# Embed the whole corpus once up front; these embeddings are reused for
# every semantic search in the matching loop below.
chunk_embs = model.encode(wiki_chunks, convert_to_tensor=True, show_progress_bar=True).to(device)
|
|
| |
# Load the synthetic documents and flatten them: one record per
# (source index, difficulty label) pair.
with open("/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_v1.json", "r") as f:
    res = json.load(f)

my_target_documents = [
    {"index": item['index'], "label": label, "text": text}
    for item in res
    for label, text in item['diff_label_texts'].items()
]
|
|
| |
# Resume support: if a previous run left partial output at save_path, load it
# and record which (index, label) pairs are already done so they are skipped.
save_path = "/home/mshahidul/readctrl/data/data_annotator_data/crowdsourcing_input_en_v2.json"
processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(d['index'], d['label']) for d in processed_data}
|
|
| |
print("Starting Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    # Skip documents already handled in a previous (checkpointed) run.
    if (doc['index'], doc['label']) in processed_keys:
        continue

    doc_len = len(doc['text'].split())
    if doc_len == 0:
        # An empty/whitespace-only document would raise ZeroDivisionError in
        # the length-ratio check below and cannot be meaningfully anchored.
        continue

    doc_emb = model.encode(doc['text'], convert_to_tensor=True).to(device)

    # Top 25 semantically closest Wikipedia chunks for this document.
    hits = util.semantic_search(doc_emb, chunk_embs, top_k=25)[0]

    # Prefer the highest-ranked chunk whose length is within +/-20% of the
    # document's; otherwise fall back to the hit with the smallest absolute
    # length difference.
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')
    for hit in hits:
        cand_text = wiki_chunks[hit['corpus_id']]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)

        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text

        if 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break

    # `is None` (not truthiness): an anchor that happens to be an empty
    # string must not silently re-trigger the fallback.
    if wiki_anchor is None:
        wiki_anchor = best_fallback
    if wiki_anchor is None:
        # No retrieval hits at all — nothing to compare against; previously
        # this crashed inside textstat with a None argument.
        continue

    # Compute each FKGL score once and reuse it for the delta.
    doc_fkgl = textstat.flesch_kincaid_grade(doc['text'])
    wiki_fkgl = textstat.flesch_kincaid_grade(wiki_anchor)

    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_fkgl,
        "wiki_fkgl": wiki_fkgl,
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_fkgl - wiki_fkgl
    })

    # Checkpoint every 20 new records so a crash loses little work.
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final write so the tail (< 20 records since the last checkpoint) is kept.
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)