import os

# Environment Setup: pin the job to a single GPU. These must be set before torch initializes CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--lang", type=str, default="pt", help="language code")
args = parser.parse_args()
lang_code = args.lang

import json
import tqdm
import numpy as np
import pandas as pd
import textstat
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset

# lang_code="pt"
# device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load Models Efficiently
model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')
# Disable unnecessary components in spaCy to save time/memory; the parser is kept because it is
# needed for sentence segmentation and the dependency-depth statistic below.
# Note: spaCy's English pipeline is named en_core_web_sm, while the other languages use *_core_news_sm.
spacy_model_name = "en_core_web_sm" if lang_code == "en" else f"{lang_code}_core_news_sm"
nlp = spacy.load(spacy_model_name, disable=["ner", "lemmatizer", "attribute_ruler"])
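# The spaCy pipelines must be installed beforehand, e.g. `python -m spacy download pt_core_news_sm`.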

def get_parse_tree_stats(text):
    """Return the mean, over sentences, of the maximum dependency-tree depth (root counted as depth 1)."""
    def walk_tree(node, depth):
        children = list(node.children)
        if not children:
            return depth
        return max(walk_tree(child, depth + 1) for child in children)

    doc = nlp(text)
    depths = [walk_tree(sent.root, 1) for sent in doc.sents]
    return float(np.mean(depths)) if depths else 0.0
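# For example, for "The cat sat on the mat." most parses root the tree at "sat", and the longest
# path "sat" -> "on" -> "mat" -> "the" gives a depth of 4; the exact value depends on the model's parse.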

# 2. Data Loading
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True)
# Taking a subset for the anchor pool to keep memory manageable
wiki_list = [item['text'] for item in ds.take(1000000)] 

# 3. PRE-PROCESS WIKI ANCHORS (Do this ONCE)
print("Chunking and Encoding Wikipedia...")
wiki_chunks = []
for text in wiki_list:
    paragraphs = [p.strip() for p in text.split('\n\n') if len(p.split()) > 20]
    wiki_chunks.extend(paragraphs)

# Encode all chunks at once and keep on GPU
chunk_embs = model.encode(wiki_chunks, convert_to_tensor=True, show_progress_bar=True)
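# If the anchor pool grows too large for GPU memory, encoding with a smaller batch_size and/or
# keeping chunk_embs on CPU are possible fallbacks; util.semantic_search accepts CPU tensors as well.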

# 4. Load Target Docs
with open(f"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_{lang_code}_v1.json", "r") as f:
    res = json.load(f)

my_target_documents = []
for item in res:
    for key, value in item['diff_label_texts'].items():
        my_target_documents.append({"index": item['index'], "label": key, "text": value})

# Load Progress
save_path = f"/home/mshahidul/readctrl/data/data_annotator_data/crowdsourcing_input_{lang_code}_v1.json"
processed_data = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        processed_data = json.load(f)
processed_keys = {(d['index'], d['label']) for d in processed_data}

# 5. Process with Batching logic where possible
print("Starting Matching Loop...")
for doc in tqdm.tqdm(my_target_documents):
    if (doc['index'], doc['label']) in processed_keys:
        continue

    # A. Robust Anchor Finding (Optimized)
    doc_emb = model.encode(doc['text'], convert_to_tensor=True)
    doc_len = len(doc['text'].split())
    
    hits = util.semantic_search(doc_emb, chunk_embs, top_k=25)[0]
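    # Each hit is a dict with 'corpus_id' (index into wiki_chunks) and 'score' (cosine similarity),
    # sorted from most to least similar. Below, prefer the most similar chunk whose length is within
    # +/-20% of the target document; if none of the top hits qualifies, fall back to the closest length.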
    
    wiki_anchor = None
    best_fallback = None
    min_delta = float('inf')

    for hit in hits:
        cand_text = wiki_chunks[hit['corpus_id']]
        cand_len = len(cand_text.split())
        len_diff = abs(cand_len - doc_len)
        
        # Track fallback while looking for strict match
        if len_diff < min_delta:
            min_delta = len_diff
            best_fallback = cand_text
            
        if doc_len and 0.8 <= (cand_len / doc_len) <= 1.2:
            wiki_anchor = cand_text
            break
    
    if not wiki_anchor:
        wiki_anchor = best_fallback

    # B. Calculate Metrics
    doc_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(doc['text']),
        "word_count": doc_len
    }
    wiki_metrics = {
        "fkgl": textstat.flesch_kincaid_grade(wiki_anchor),
        "word_count": len(wiki_anchor.split())
    }
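    # Note: textstat's Flesch-Kincaid grade formula is calibrated for English; for other languages
    # it serves only as a rough readability proxy here.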
    
    # C. Store results
    processed_data.append({
        "index": doc['index'],
        "label": doc['label'],
        "original_doc": doc['text'],
        "wiki_anchor": wiki_anchor,
        "doc_fkgl": doc_metrics['fkgl'],
        "wiki_fkgl": wiki_metrics['fkgl'],
        "doc_tree_depth": get_parse_tree_stats(doc['text']),
        "wiki_tree_depth": get_parse_tree_stats(wiki_anchor),
        "fkgl_delta": doc_metrics['fkgl'] - wiki_metrics['fkgl']
    })

    # Save every 20 to reduce disk I/O overhead
    if len(processed_data) % 20 == 0:
        with open(save_path, "w") as f:
            json.dump(processed_data, f, indent=2)

# Final Save
with open(save_path, "w") as f:
    json.dump(processed_data, f, indent=2)


# python /home/mshahidul/readctrl/code/vectordb_build/data_annotate_data_prep_test_v2.py --lang pt
# python /home/mshahidul/readctrl/code/vectordb_build/data_annotate_data_prep_test_v2.py --lang en
# python /home/mshahidul/readctrl/code/vectordb_build/data_annotate_data_prep_test_v2.py --lang es
# python /home/mshahidul/readctrl/code/vectordb_build/data_annotate_data_prep_test_v2.py --lang fr
# python /home/mshahidul/readctrl/code/readability_control.py