# readctrl / code / vectordb_build / data_annotate_data_prep_test_v2.py
# NOTE: Hugging Face upload-page metadata commented out so the file is runnable:
#   shahidul034's picture
#   Add files using upload-large-folder tool
#   1db7196 verified
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import argparse
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from datasets import load_dataset
# Replacement for SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# CLI arguments: --lang picks the Wikipedia dump and the spaCy model.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--lang", type=str, default="pt", help="language code")
cli_args = arg_parser.parse_args()
lang_code = cli_args.lang

# Load the language-specific spaCy pipeline. Only the dependency parser is
# needed (for tree-depth stats), so NER / lemmatizer / attribute ruler are
# disabled for speed.
nlp = spacy.load(f"{lang_code}_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])
def walk_tree(node, depth):
    """Return the maximum depth of the dependency subtree rooted at *node*.

    *node* only needs a ``children`` iterable (a spaCy ``Token`` in
    practice). A node with no children contributes its own *depth*.
    """
    children = list(node.children)
    if not children:
        return depth
    return max(walk_tree(child, depth + 1) for child in children)


def get_parse_tree_stats(text):
    """Mean dependency-parse-tree depth over the sentences of *text*.

    Runs the module-level spaCy pipeline ``nlp`` on *text*; each sentence's
    tree depth is counted with the root at depth 1. Returns 0.0 when the
    text has no sentences.

    Fixes vs. original: ``walk_tree`` is defined once at module level
    instead of being re-created on every sentence iteration, and the result
    is normalized to a plain ``float`` (the original returned ``np.float64``
    or the int ``0`` depending on input).
    """
    doc = nlp(text)
    depths = [walk_tree(sent.root, 1) for sent in doc.sents]
    return float(np.mean(depths)) if depths else 0.0
# --- 1. Data loading --------------------------------------------------------
print(f"Loading Wikipedia for {lang_code}...")
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True)
wiki_list = [item['text'] for item in ds]

# --- 2. Pre-process wiki anchors --------------------------------------------
# Split each article on blank lines; keep only paragraphs with more than 20
# words as candidate anchor chunks.
print("Chunking Wikipedia...")
wiki_chunks = [
    para.strip()
    for article in wiki_list
    for para in article.split('\n\n')
    if len(para.split()) > 20
]

# --- 3. TF-IDF vectorization ------------------------------------------------
print("Computing TF-IDF Vectors (this may take a few minutes)...")
vectorizer = TfidfVectorizer(
    max_features=50000,  # cap the vocabulary so the matrix fits in memory
    stop_words=None,     # a language-specific stop-word list could go here
)
# Fit the vectorizer on the corpus and vectorize every chunk in one pass.
chunk_tfidf = vectorizer.fit_transform(wiki_chunks)
# --- 4. Load the synthetic target documents ---------------------------------
with open(f"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_{lang_code}_v1.json", "r") as f:
    res = json.load(f)

# Flatten each record into one entry per difficulty label.
my_target_documents = [
    {"index": item['index'], "label": label, "text": text}
    for item in res
    for label, text in item['diff_label_texts'].items()
]

# Resume support: reload any previously saved output and remember which
# (index, label) pairs were already processed.
root_dir = "/home/mshahidul/readctrl/data/data_annotator_data/tf_idf_anchors"
os.makedirs(root_dir, exist_ok=True)
save_path = f"{root_dir}/crowdsourcing_input_{lang_code}_v1.json"

processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(entry['index'], entry['label']) for entry in processed_data}
# 5. Processing loop: for each target document, find the most similar
# Wikipedia chunk by TF-IDF cosine similarity -- preferring a chunk of
# comparable length -- then record readability and parse-depth metrics
# for both texts.
print("Starting TF-IDF Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    if (doc['index'], doc['label']) in processed_keys:
        continue  # already processed in a previous run

    # A. TF-IDF anchor finding: project the document into the fitted TF-IDF
    # space; linear_kernel equals cosine similarity for L2-normalized
    # TF-IDF rows and is faster than cosine_similarity.
    doc_tfidf = vectorizer.transform([doc['text']])
    cosine_similarities = linear_kernel(doc_tfidf, chunk_tfidf).flatten()
    # Top 25 candidate indices, most similar first.
    top_indices = cosine_similarities.argsort()[:-26:-1]

    doc_len = len(doc['text'].split())
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')
    for idx in top_indices:
        cand_text = wiki_chunks[idx]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text
        # Accept the first (i.e. most similar) candidate whose length is
        # within +/-20% of the document's. Guard doc_len == 0 (empty text)
        # to avoid a ZeroDivisionError; such docs use the length fallback.
        if doc_len > 0 and 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break
    if wiki_anchor is None:
        wiki_anchor = best_fallback
    if wiki_anchor is None:
        # No candidates at all (empty corpus): skip rather than crash in
        # textstat on a None anchor.
        continue

    # B. Readability metrics for both texts.
    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split())
    }

    # C. Store results.
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl']
    })

    # Checkpoint every 20 records so a crash loses little work.
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final save.
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)