# Pin GPU selection BEFORE torch is imported: CUDA reads CUDA_VISIBLE_DEVICES
# once at context initialization, so these must be set first.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import argparse
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from datasets import load_dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
| |
|
| | |
| |
|
| |
|
# CLI: --lang selects both the Wikipedia dump config ("20231101.<lang>") and
# the spaCy pipeline language. Defaults to Portuguese.
parser = argparse.ArgumentParser()
parser.add_argument("--lang", type=str, default="pt", help="language code")
args = parser.parse_args()
lang_code = args.lang
| |
|
| | |
| | nlp = spacy.load(f"{lang_code}_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"]) |
| |
|
def _subtree_depth(node, depth):
    """Return the deepest root-to-leaf depth below *node*.

    *depth* is the depth assigned to *node* itself; a leaf returns it
    unchanged.
    """
    children = list(node.children)
    if not children:
        return depth
    return max(_subtree_depth(child, depth + 1) for child in children)


def get_parse_tree_stats(text):
    """Mean dependency-parse-tree depth over the sentences of *text*.

    Each sentence's depth is the longest root-to-leaf path in its parse
    tree, with a bare root counting as depth 1. Returns 0 when the text
    yields no sentences.

    Fix vs. original: the tree-walking helper was redefined inside the
    per-sentence loop on every call; it is now a single module-level
    helper, and the append loop is a comprehension.
    """
    doc = nlp(text)
    depths = [_subtree_depth(sent.root, 1) for sent in doc.sents]
    return np.mean(depths) if depths else 0
| |
|
| | |
print(f"Loading Wikipedia for {lang_code}...")
# streaming=True avoids caching the full Arrow dataset on disk, but the list
# comprehension below still materializes every article's text in RAM — the
# TF-IDF fit later needs the whole corpus at once anyway.
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True)
wiki_list = [item['text'] for item in ds]
| |
|
| | |
# Split every article into blank-line-separated paragraphs, keeping only
# those with more than 20 words as candidate anchor chunks.
print("Chunking Wikipedia...")
wiki_chunks = [
    paragraph.strip()
    for article in wiki_list
    for paragraph in article.split('\n\n')
    if len(paragraph.split()) > 20
]
| |
|
| | |
print("Computing TF-IDF Vectors (this may take a few minutes)...")
# Vocabulary capped at 50k terms to bound memory. No stop-word list: the
# corpus language varies with --lang and sklearn only ships English stop words.
vectorizer = TfidfVectorizer(
    max_features=50000,
    stop_words=None
)
# Sparse (n_chunks x vocab) matrix. TfidfVectorizer L2-normalizes each row,
# so the plain dot product (linear_kernel) used later equals cosine similarity.
chunk_tfidf = vectorizer.fit_transform(wiki_chunks)
| |
|
| | |
# Load the synthetic difficulty-labelled texts for this language.
with open(f"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_{lang_code}_v1.json", "r") as f:
    res = json.load(f)

# Flatten to one work item per (source index, difficulty label) pair.
my_target_documents = []
for item in res:
    for key, value in item['diff_label_texts'].items():
        my_target_documents.append({"index": item['index'], "label": key, "text": value})
root_dir = "/home/mshahidul/readctrl/data/data_annotator_data/tf_idf_anchors"
os.makedirs(root_dir, exist_ok=True)
save_path = f"{root_dir}/crowdsourcing_input_{lang_code}_v1.json"
# Resume support: reload any previously saved output so already-processed
# (index, label) pairs can be skipped in the main loop.
processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(d['index'], d['label']) for d in processed_data}
| |
|
| | |
print("Starting TF-IDF Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    # Skip pairs already saved by a previous (partial) run.
    if (doc['index'], doc['label']) in processed_keys:
        continue

    # Cosine similarity against every Wikipedia chunk: the TF-IDF rows are
    # L2-normalized, so linear_kernel (dot product) == cosine similarity.
    doc_tfidf = vectorizer.transform([doc['text']])
    cosine_similarities = linear_kernel(doc_tfidf, chunk_tfidf).flatten()

    # Indices of the 25 most similar chunks, best first.
    top_indices = cosine_similarities.argsort()[:-26:-1]

    doc_len = len(doc['text'].split())
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')

    for idx in top_indices:
        cand_text = wiki_chunks[idx]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)

        # Remember the closest-length candidate as a fallback.
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text

        # Accept the first (most similar) candidate within +/-20% of the
        # target length. Bug fix: guard doc_len == 0 — the original divided
        # by doc_len and crashed on empty/whitespace-only target texts.
        if doc_len > 0 and 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break

    # Bug fix: identity check instead of truthiness, so a legitimately
    # found (but short) anchor is never discarded.
    if wiki_anchor is None:
        wiki_anchor = best_fallback

    # Bug fix: if there were no candidates at all (empty top_indices),
    # skip this document instead of crashing inside textstat on None.
    if wiki_anchor is None:
        continue

    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split())
    }

    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl']
    })

    # Periodic checkpoint: a crash loses at most ~20 documents of work.
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)
| |
|
| | |
# Final write: persist everything, including any entries added after the
# last in-loop checkpoint.
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)