# readctrl / code / vectordb_build / data_annotate_data_prep_test_v2.py
# NOTE: Hugging Face upload-page metadata commented out so the file is runnable:
#   shahidul034's picture
#   Add files using upload-large-folder tool
#   1db7196 verified
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import argparse
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from datasets import load_dataset
# Replacement for SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# CLI arguments: --lang picks the Wikipedia dump and the spaCy model.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--lang", type=str, default="pt", help="language code")
cli_args = arg_parser.parse_args()
lang_code = cli_args.lang

# Load the language-specific spaCy pipeline. Only the dependency parser is
# needed (for tree-depth stats), so NER / lemmatizer / attribute ruler are
# disabled for speed.
nlp = spacy.load(f"{lang_code}_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])
def walk_tree(node, depth):
    """Return the maximum depth of the dependency subtree rooted at *node*.

    *node* only needs a ``children`` iterable (a spaCy ``Token`` in
    practice). A node with no children contributes its own *depth*.
    """
    children = list(node.children)
    if not children:
        return depth
    return max(walk_tree(child, depth + 1) for child in children)


def get_parse_tree_stats(text):
    """Mean dependency-parse-tree depth over the sentences of *text*.

    Runs the module-level spaCy pipeline ``nlp`` on *text*; each sentence's
    tree depth is counted with the root at depth 1. Returns 0.0 when the
    text has no sentences.

    Fixes vs. original: ``walk_tree`` is defined once at module level
    instead of being re-created on every sentence iteration, and the result
    is normalized to a plain ``float`` (the original returned ``np.float64``
    or the int ``0`` depending on input).
    """
    doc = nlp(text)
    depths = [walk_tree(sent.root, 1) for sent in doc.sents]
    return float(np.mean(depths)) if depths else 0.0
# --- 1. Data loading --------------------------------------------------------
print(f"Loading Wikipedia for {lang_code}...")
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True)
wiki_list = [item['text'] for item in ds]

# --- 2. Pre-process wiki anchors --------------------------------------------
# Split each article on blank lines; keep only paragraphs with more than 20
# words as candidate anchor chunks.
print("Chunking Wikipedia...")
wiki_chunks = [
    para.strip()
    for article in wiki_list
    for para in article.split('\n\n')
    if len(para.split()) > 20
]

# --- 3. TF-IDF vectorization ------------------------------------------------
print("Computing TF-IDF Vectors (this may take a few minutes)...")
vectorizer = TfidfVectorizer(
    max_features=50000,  # cap the vocabulary so the matrix fits in memory
    stop_words=None,     # a language-specific stop-word list could go here
)
# Fit the vectorizer on the corpus and vectorize every chunk in one pass.
chunk_tfidf = vectorizer.fit_transform(wiki_chunks)
# --- 4. Load the synthetic target documents ---------------------------------
with open(f"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_{lang_code}_v1.json", "r") as f:
    res = json.load(f)

# Flatten each record into one entry per difficulty label.
my_target_documents = [
    {"index": item['index'], "label": label, "text": text}
    for item in res
    for label, text in item['diff_label_texts'].items()
]

# Resume support: reload any previously saved output and remember which
# (index, label) pairs were already processed.
root_dir = "/home/mshahidul/readctrl/data/data_annotator_data/tf_idf_anchors"
os.makedirs(root_dir, exist_ok=True)
save_path = f"{root_dir}/crowdsourcing_input_{lang_code}_v1.json"

processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(entry['index'], entry['label']) for entry in processed_data}
# 5. Processing loop: for each target document, find the most similar
# Wikipedia chunk by TF-IDF cosine similarity -- preferring a chunk of
# comparable length -- then record readability and parse-depth metrics
# for both texts.
print("Starting TF-IDF Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    if (doc['index'], doc['label']) in processed_keys:
        continue  # already processed in a previous run

    # A. TF-IDF anchor finding: project the document into the fitted TF-IDF
    # space; linear_kernel equals cosine similarity for L2-normalized
    # TF-IDF rows and is faster than cosine_similarity.
    doc_tfidf = vectorizer.transform([doc['text']])
    cosine_similarities = linear_kernel(doc_tfidf, chunk_tfidf).flatten()
    # Top 25 candidate indices, most similar first.
    top_indices = cosine_similarities.argsort()[:-26:-1]

    doc_len = len(doc['text'].split())
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')
    for idx in top_indices:
        cand_text = wiki_chunks[idx]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text
        # Accept the first (i.e. most similar) candidate whose length is
        # within +/-20% of the document's. Guard doc_len == 0 (empty text)
        # to avoid a ZeroDivisionError; such docs use the length fallback.
        if doc_len > 0 and 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break
    if wiki_anchor is None:
        wiki_anchor = best_fallback
    if wiki_anchor is None:
        # No candidates at all (empty corpus): skip rather than crash in
        # textstat on a None anchor.
        continue

    # B. Readability metrics for both texts.
    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split())
    }

    # C. Store results.
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl']
    })

    # Checkpoint every 20 records so a crash loses little work.
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final save.
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)