File size: 4,706 Bytes
1db7196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# NOTE: CUDA env vars must be set before `import torch` (or any other
# CUDA-aware library) so device ordering/visibility takes effect.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import argparse
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from datasets import load_dataset

# TF-IDF similarity is used here as a replacement for SentenceTransformer
# embeddings (cheaper, no GPU model needed for the retrieval step).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Environment Setup

# Single CLI option: the Wikipedia / spaCy language code to process.
parser = argparse.ArgumentParser()
parser.add_argument("--lang", type=str, default="pt", help="language code")
args = parser.parse_args()
lang_code = args.lang

# Load Spacy
# The dependency parser is kept (needed for doc.sents and parse-tree depth
# below); ner / lemmatizer / attribute_ruler are disabled for speed.
nlp = spacy.load(f"{lang_code}_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])

def _max_subtree_depth(node, depth):
    """Return the maximum depth reached in the dependency subtree of *node*.

    *depth* is the depth assigned to *node* itself; leaves terminate the
    recursion. Hoisted to module level so it is not redefined per sentence.
    """
    children = list(node.children)
    if not children:
        return depth
    return max(_max_subtree_depth(child, depth + 1) for child in children)


def get_parse_tree_stats(text):
    """Mean dependency-parse-tree depth over the sentences of *text*.

    The root of each sentence counts as depth 1. Returns 0.0 when the text
    yields no sentences (e.g. empty input).
    """
    doc = nlp(text)
    depths = [_max_subtree_depth(sent.root, 1) for sent in doc.sents]
    # float() so callers always receive a plain Python float, never np.float64
    # (keeps the JSON output type consistent with the 0.0 empty case).
    return float(np.mean(depths)) if depths else 0.0

# 1 + 2. Data Loading & Chunking
# The dataset is opened in streaming mode, so chunk each article as it
# arrives instead of first materializing every full article text in memory
# (the previous wiki_list held the entire Wikipedia dump in RAM at once).
print(f"Loading Wikipedia for {lang_code}...")
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True)

print("Chunking Wikipedia...")
# Anchor candidates: paragraphs with more than 20 words (shorter ones are
# too small to be meaningful readability anchors).
wiki_chunks = []
for item in ds:
    for para in item['text'].split('\n\n'):
        para = para.strip()
        if len(para.split()) > 20:
            wiki_chunks.append(para)

# 3. TF-IDF VECTORIZATION
print("Computing TF-IDF Vectors (this may take a few minutes)...")
vectorizer = TfidfVectorizer(
    max_features=50000,  # cap the vocabulary so the matrix stays bounded in memory
    stop_words=None      # pass a language-specific stop-word list if desired
)
# Fit on the wiki chunks; target docs are later projected into this space
# with vectorizer.transform().
chunk_tfidf = vectorizer.fit_transform(wiki_chunks)

# 4. Load Target Docs
# Flatten each item's per-label texts into a single flat work list of
# {index, label, text} records.
with open(f"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_{lang_code}_v1.json", "r") as f:
    res = json.load(f)

my_target_documents = [
    {"index": item['index'], "label": key, "text": value}
    for item in res
    for key, value in item['diff_label_texts'].items()
]

root_dir = "/home/mshahidul/readctrl/data/data_annotator_data/tf_idf_anchors"
os.makedirs(root_dir, exist_ok=True)
save_path = f"{root_dir}/crowdsourcing_input_{lang_code}_v1.json"

# Resume support: reload any partial output from a previous run and record
# which (index, label) pairs are already done.
try:
    with open(save_path, "r") as f:
        processed_data = json.load(f)
except FileNotFoundError:
    processed_data = []
processed_keys = {(d['index'], d['label']) for d in processed_data}

# 5. Processing Loop
print("Starting TF-IDF Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    # Skip anything already present in a previous (resumed) run.
    if (doc['index'], doc['label']) in processed_keys:
        continue

    # A. TF-IDF Anchor Finding
    # Project the target doc into the wiki TF-IDF space and rank all wiki
    # chunks by cosine similarity (linear_kernel equals cosine similarity
    # here because TF-IDF rows are L2-normalized; it is also faster).
    doc_tfidf = vectorizer.transform([doc['text']])
    cosine_similarities = linear_kernel(doc_tfidf, chunk_tfidf).flatten()

    # Indices of the 25 most similar chunks, most similar first.
    top_indices = cosine_similarities.argsort()[:-26:-1]

    doc_len = len(doc['text'].split())
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')

    for idx in top_indices:
        cand_text = wiki_chunks[idx]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)

        # Track the closest-length candidate as a fallback.
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text

        # Prefer the first (most similar) candidate within +/-20% of the
        # target length. The `doc_len` guard avoids a ZeroDivisionError on
        # empty / whitespace-only target texts.
        if doc_len and 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break

    # `is None` (not truthiness) so a legitimate anchor is never discarded.
    if wiki_anchor is None:
        wiki_anchor = best_fallback

    # No candidates at all (empty corpus): skip this doc rather than crash
    # in textstat below on a None anchor.
    if wiki_anchor is None:
        continue

    # B. Calculate Metrics
    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split())
    }

    # C. Store results
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl']
    })

    # Periodic checkpoint so a crash loses at most ~20 docs of work.
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final Save: flush the complete result set one last time so the tail
# (anything after the last periodic checkpoint) is not lost.
with open(save_path, "w") as f:
    f.write(json.dumps(processed_data, indent=2))