import os
# Pin GPU visibility before torch is imported (L12) so the CUDA runtime
# enumerates devices in PCI bus order and only ever sees physical GPU 2.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import argparse
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from datasets import load_dataset
# Replacement for SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# ---- CLI / environment setup ----
cli = argparse.ArgumentParser()
cli.add_argument("--lang", type=str, default="pt", help="language code")
lang_code = cli.parse_args().lang

# ---- spaCy pipeline: parser only; NER/lemmatizer/ruler disabled for speed ----
nlp = spacy.load(
    f"{lang_code}_core_web_sm",
    disable=["ner", "lemmatizer", "attribute_ruler"],
)
def get_parse_tree_stats(text, nlp_model=None):
    """Return the average dependency-parse-tree depth over sentences of *text*.

    Args:
        text: Raw text to parse.
        nlp_model: Optional callable mapping text to a parsed doc (anything
            exposing ``.sents`` whose items have a ``.root`` node with
            ``.children``).  Defaults to the module-level spaCy pipeline
            ``nlp``; injectable so the function can be tested without spaCy.

    Returns:
        Mean over sentences of the deepest root-to-leaf path length (the
        root counts as depth 1), or 0 when the doc has no sentences.
    """
    def _max_depth(node, depth):
        # Leaf token: the path ends here; otherwise recurse into children.
        children = list(node.children)
        if not children:
            return depth
        return max(_max_depth(child, depth + 1) for child in children)

    doc = (nlp_model or nlp)(text)
    # Helper is defined once, outside the per-sentence loop of the original.
    depths = [_max_depth(sent.root, 1) for sent in doc.sents]
    return np.mean(depths) if depths else 0
# 1. Data Loading
print(f"Loading Wikipedia for {lang_code}...")
ds = load_dataset(
    "wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True
)
wiki_list = [item['text'] for item in ds]

# 2. PRE-PROCESS WIKI ANCHORS
# Split each article on blank lines; only paragraphs longer than 20 words
# become anchor candidates, so headings and stubs are dropped.
print("Chunking Wikipedia...")
wiki_chunks = [
    p.strip()
    for text in wiki_list
    for p in text.split('\n\n')
    if len(p.split()) > 20
]
# 3. TF-IDF VECTORIZATION
print("Computing TF-IDF Vectors (this may take a few minutes)...")
# The 50k-feature cap keeps the sparse matrix from exploding in memory.
# No stop-word list is applied; pass a language-specific list if needed.
vectorizer = TfidfVectorizer(max_features=50000, stop_words=None)
chunk_tfidf = vectorizer.fit_transform(wiki_chunks)
# 4. Load Target Docs
# Flatten every item's per-difficulty-label texts into one record per
# (index, label) pair.
with open(f"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_{lang_code}_v1.json", "r") as f:
    res = json.load(f)
my_target_documents = [
    {"index": item['index'], "label": label, "text": text}
    for item in res
    for label, text in item['diff_label_texts'].items()
]

# Resume support: reload any previously saved output and remember which
# (index, label) pairs are already done.
root_dir = "/home/mshahidul/readctrl/data/data_annotator_data/tf_idf_anchors"
os.makedirs(root_dir, exist_ok=True)
save_path = f"{root_dir}/crowdsourcing_input_{lang_code}_v1.json"
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
else:
    processed_data = []
processed_keys = {(d['index'], d['label']) for d in processed_data}
# 5. Processing Loop
print("Starting TF-IDF Matching Loop...")


def _choose_anchor(candidate_indices, doc_len):
    """Pick the wiki chunk whose word count best matches ``doc_len``.

    Scans ``candidate_indices`` (most-similar first) and returns the first
    chunk whose word count is within +/-20% of ``doc_len``; failing that,
    the candidate with the smallest absolute length difference.  Returns
    None when there are no candidates at all.
    """
    best_fallback = None
    min_delta = float('inf')
    for idx in candidate_indices:
        cand_text = wiki_chunks[idx]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text
        # Guard doc_len > 0: an empty target doc would otherwise raise
        # ZeroDivisionError here.
        if doc_len > 0 and 0.8 <= (cand_len / doc_len) <= 1.2:
            return cand_text
    return best_fallback


for doc in tqdm.tqdm(my_target_documents):
    if (doc['index'], doc['label']) in processed_keys:
        continue  # already written by a previous (resumed) run

    # A. TF-IDF Anchor Finding: project the doc into the fitted TF-IDF space
    # and rank wiki chunks by cosine similarity (linear_kernel on the
    # L2-normalised TF-IDF rows is equivalent to cosine similarity and faster).
    doc_tfidf = vectorizer.transform([doc['text']])
    cosine_similarities = linear_kernel(doc_tfidf, chunk_tfidf).flatten()
    top_indices = cosine_similarities.argsort()[:-26:-1]  # top 25, best first

    doc_len = len(doc['text'].split())
    wiki_anchor = _choose_anchor(top_indices, doc_len)
    if wiki_anchor is None:
        # No wiki chunks were available -- nothing sensible to pair with.
        print(f"No anchor candidates for index={doc['index']} label={doc['label']}; skipping.")
        continue

    # B. Calculate Metrics (readability for the doc and its anchor).
    doc_fkgl = textstat.flesch_kincaid_grade(doc['text'])
    wiki_fkgl = textstat.flesch_kincaid_grade(wiki_anchor)

    # C. Store results
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_fkgl,
        "wiki_fkgl": wiki_fkgl,
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_fkgl - wiki_fkgl
    })

    # Checkpoint every 20 records so an interrupted run can resume.
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final Save
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)