# readctrl/code/vectordb_build/data_annotate_data_prep copy.py
import os
# Environment setup: pin the visible GPU before torch is imported so CUDA
# initializes on the intended device.
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
device = "cuda" if torch.cuda.is_available() else "cpu"
# 1. Load Models Efficiently
model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B').to(device)
# Keep the parser (needed for sentence boundaries and dependency trees);
# disable components this script doesn't use to save time and memory.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])
def get_parse_tree_stats(text):
    """Mean maximum dependency-tree depth across sentences (root counts as depth 1)."""
    def walk_tree(node, depth):
        if not list(node.children):
            return depth
        return max(walk_tree(child, depth + 1) for child in node.children)

    doc = nlp(text)
    depths = [walk_tree(sent.root, 1) for sent in doc.sents]
    return np.mean(depths) if depths else 0
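# Rough sanity check (illustrative; exact values depend on the parse
# en_core_web_sm produces):
#   get_parse_tree_stats("The cat sat on the mat.")  -> a small depth (~3)
#   deeply nested clauses yield noticeably larger values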
# 2. Data Loading
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split='train', streaming=True)
# Take a subset of articles as the anchor pool to keep memory manageable
wiki_list = [item['text'] for item in ds.take(1000000)]
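# .take(n) on a streaming dataset yields the first n examples lazily, so the
# full dump is never materialized; only this subset is held in RAM.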
# 3. PRE-PROCESS WIKI ANCHORS (Do this ONCE)
print("Chunking and Encoding Wikipedia...")
wiki_chunks = []
for text in wiki_list:
    paragraphs = [p.strip() for p in text.split('\n\n') if len(p.split()) > 20]
    wiki_chunks.extend(paragraphs)
# Encode all chunks at once and keep on GPU
chunk_embs = model.encode(wiki_chunks, convert_to_tensor=True, show_progress_bar=True).to(device)
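# Note: with up to 1M articles the chunk pool can run to millions of
# paragraphs, so this tensor may exceed GPU memory. If it does, encoding in
# shards or keeping chunk_embs on the CPU is a possible fallback --
# util.semantic_search also accepts CPU tensors.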
# 4. Load Target Docs
with open("/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_v1.json", "r") as f:
res = json.load(f)
my_target_documents = []
for item in res:
for key, value in item['diff_label_texts'].items():
my_target_documents.append({"index": item['index'], "label": key, "text": value})
# Load Progress
save_path = "/home/mshahidul/readctrl/data/data_annotator_data/crowdsourcing_input_en_v2.json"
processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(d['index'], d['label']) for d in processed_data}
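# (index, label) pairs already present in the output are skipped below, so the
# script can resume after an interruption without redoing finished documents.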
# 5. Process with Batching logic where possible
print("Starting Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    if (doc['index'], doc['label']) in processed_keys:
        continue
    # A. Anchor finding: embed the target doc and retrieve the top candidates,
    # then prefer one of comparable length (see the scan below)
    doc_emb = model.encode(doc['text'], convert_to_tensor=True).to(device)
    doc_len = len(doc['text'].split())
    hits = util.semantic_search(doc_emb, chunk_embs, top_k=25)[0]
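    # semantic_search returns one ranked list per query; each hit is a dict
    # with 'corpus_id' (an index into wiki_chunks) and a cosine 'score'.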
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')
    for hit in hits:
        cand_text = wiki_chunks[hit['corpus_id']]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)
        # Track the closest-length candidate as a fallback while scanning
        # for a strict match (candidate within 80-120% of the target length)
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text
        if 0.8 <= (cand_len / max(doc_len, 1)) <= 1.2:  # guard against empty docs
            wiki_anchor = cand_text
            break
    if not wiki_anchor:
        wiki_anchor = best_fallback
    # B. Calculate Metrics
    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split())
    }
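    # FKGL (Flesch-Kincaid Grade Level) estimates the US school grade needed
    # to read the text, via the standard formula:
    #   0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59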
    # C. Store results
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl']
    })
    # Checkpoint every 20 records to limit disk I/O while still allowing resume
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)
# Final Save
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)
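# Note: a more crash-safe checkpoint pattern (not used above) is to dump to a
# temporary file and os.replace() it over save_path, so an interruption
# mid-write cannot truncate the JSON.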