Upload create_inline_tags.py with huggingface_hub
create_inline_tags.py +170 -0
create_inline_tags.py
ADDED
@@ -0,0 +1,170 @@
# Extract inline named-entity tag pairs from a parallel English-Indic corpus:
# run NER on both sides, language-ID each sentence with fastText, transliterate
# the non-English entity to Latin script, and keep entity pairs whose surface
# forms roughly match.

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import fasttext
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import re
from tqdm import tqdm
from functools import lru_cache
import os
import urllib.request

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer for NER
ner_model_name = "xlm-roberta-large-finetuned-conll03-english"
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_name).to(device)

# Create NER pipeline (aggregation_strategy="simple" merges subword pieces into
# whole entities, so each result carries 'word' and 'entity_group' keys)
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    aggregation_strategy="simple",
)

# # Load FastText model
# fasttext_model_dir = '/home/vikrant-MNMT/myenv/fasttext_model'
# fasttext_model_path = os.path.join(fasttext_model_dir, 'lid.176.ftz')
# if not os.path.exists(fasttext_model_path):
#     print("Downloading FastText model...")
#     os.makedirs(fasttext_model_dir, exist_ok=True)
#     urllib.request.urlretrieve("https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz", fasttext_model_path)

fasttext_model = fasttext.load_model("/home/vikrant-MNMT/myenv/fasttext_model/lid.176.ftz")

@lru_cache(maxsize=10000)
def extract_entities(sentence):
    entities = ner_pipeline(sentence)
    return tuple((ent['word'], ent['entity_group']) for ent in entities)

@lru_cache(maxsize=10000)
def detect_language(text):
    # fastText labels look like '__label__en'; strip the prefix
    predictions = fasttext_model.predict(text, k=1)
    return predictions[0][0].split('__label__')[1]

@lru_cache(maxsize=10000)
def transliterate_to_latin(text, lang):
    if lang == 'hi' or lang == 'mr':  # Hindi and Marathi use Devanagari
        return transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
    elif lang == 'pa':
        return transliterate(text, sanscript.GURMUKHI, sanscript.ITRANS)
    elif lang == 'gu':
        return transliterate(text, sanscript.GUJARATI, sanscript.ITRANS)
    elif lang == 'bn' or lang == 'as':  # Bengali and Assamese use the same script
        return transliterate(text, sanscript.BENGALI, sanscript.ITRANS)
    elif lang == 'ur':
        return text  # Urdu is already in Latin script in our test cases
    elif lang == 'ml':
        return transliterate(text, sanscript.MALAYALAM, sanscript.ITRANS)
    elif lang == 'ta':
        return transliterate(text, sanscript.TAMIL, sanscript.ITRANS)
    elif lang == 'te':
        return transliterate(text, sanscript.TELUGU, sanscript.ITRANS)
    elif lang == 'kn':
        return transliterate(text, sanscript.KANNADA, sanscript.ITRANS)
    elif lang == 'or':
        return transliterate(text, sanscript.ORIYA, sanscript.ITRANS)
    else:
        return text  # Return as is for unsupported languages

@lru_cache(maxsize=100000)
def normalize(text):
    # Remove all non-alphanumeric characters and convert to lowercase
    return re.sub(r'[^a-zA-Z0-9]', '', text.lower())

def partial_match(s1, s2, threshold=0.7):
    # NOTE: threshold is currently unused; a match is containment either way,
    # or a shared four-character prefix, after normalization
    s1_norm = normalize(s1)
    s2_norm = normalize(s2)
    return (s1_norm in s2_norm or s2_norm in s1_norm) or \
           (len(s1_norm) >= 4 and s1_norm[:4] == s2_norm[:4])

def process_pair(source, target):
    source = source.strip()
    target = target.strip()

    source_lang = detect_language(source)
    target_lang = detect_language(target)

    # Determine which sentence is English
    if source_lang == 'en':
        en_entities, other_entities = extract_entities(source), extract_entities(target)
        other_lang = target_lang
    elif target_lang == 'en':
        en_entities, other_entities = extract_entities(target), extract_entities(source)
        other_lang = source_lang
    else:
        return [], []  # If neither is English, return no tags

    # Pair up entities of the same type whose surface forms roughly match
    # after transliterating the non-English word to Latin script
    pair_tags_en_other = []
    pair_tags_other_en = []
    for en_word, en_tag in en_entities:
        for other_word, other_tag in other_entities:
            if en_tag == other_tag:
                en_norm = normalize(en_word)
                other_trans = transliterate_to_latin(other_word, other_lang)
                other_norm = normalize(other_trans)
                if partial_match(en_norm, other_norm):
                    # Skip if either word is empty
                    if en_word.strip() and other_word.strip():
                        pair_tags_en_other.append(f"en: {en_word}\t{other_lang}: {other_word}\t{en_tag}")
                        pair_tags_other_en.append(f"{other_lang}: {other_word}\ten: {en_word}\t{en_tag}")

    return pair_tags_en_other, pair_tags_other_en

def batch_generator(source_file, target_file, batch_size):
    # Stream the parallel files in lockstep, yielding lists of batch_size lines
    with open(source_file, 'r', encoding='utf-8') as src, open(target_file, 'r', encoding='utf-8') as tgt:
        source_batch, target_batch = [], []
        for source_line, target_line in zip(src, tgt):
            source_batch.append(source_line)
            target_batch.append(target_line)
            if len(source_batch) == batch_size:
                yield source_batch, target_batch
                source_batch, target_batch = [], []
        if source_batch:  # Final, possibly short, batch
            yield source_batch, target_batch

def create_dataset(source_file, target_file, output_file_en_other, output_file_other_en, batch_size=32):
    with open(source_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    print(f"Processing {total_lines} lines from {source_file} and {target_file}")

    total_tags_en_other = 0
    total_tags_other_en = 0
    num_batches = (total_lines + batch_size - 1) // batch_size  # Ceiling division
    with open(output_file_en_other, "w", encoding="utf-8") as f_en_other, \
         open(output_file_other_en, "w", encoding="utf-8") as f_other_en:
        for i, (source_batch, target_batch) in enumerate(tqdm(batch_generator(source_file, target_file, batch_size),
                                                              total=num_batches)):
            batch_tags_en_other = []
            batch_tags_other_en = []
            for source, target in zip(source_batch, target_batch):
                pair_tags_en_other, pair_tags_other_en = process_pair(source, target)
                batch_tags_en_other.extend(pair_tags_en_other)
                batch_tags_other_en.extend(pair_tags_other_en)

            if batch_tags_en_other:
                f_en_other.write("\n".join(batch_tags_en_other) + "\n")
                f_en_other.flush()  # Ensure data is written to disk
                total_tags_en_other += len(batch_tags_en_other)

            if batch_tags_other_en:
                f_other_en.write("\n".join(batch_tags_other_en) + "\n")
                f_other_en.flush()  # Ensure data is written to disk
                total_tags_other_en += len(batch_tags_other_en)

            if (i + 1) % 1000 == 0:
                print(f"Processed {(i + 1) * batch_size} lines. Current tag count: {total_tags_en_other} (en-other), {total_tags_other_en} (other-en)")

    print(f"Inline tags extraction completed. {total_tags_en_other} tags saved to {output_file_en_other}.")
    print(f"Inline tags extraction completed. {total_tags_other_en} tags saved to {output_file_other_en}.")

def main():
    source_file = '/home/vikrant-MNMT/myenv/NMT_V2/train_aggressively_shuffled.src'
    target_file = '/home/vikrant-MNMT/myenv/NMT_V2/train_aggressively_shuffled.tgt'
    output_file_en_other = "/home/vikrant-MNMT/myenv/BPCC/inline_tages/eng_Latn-hin_Deva/inline_tag_1.txt"
    output_file_other_en = "/home/vikrant-MNMT/myenv/BPCC/inline_tages/eng_Latn-hin_Deva/inline_tag_2.txt"
    batch_size = 1000

    create_dataset(source_file, target_file, output_file_en_other, output_file_other_en, batch_size)

if __name__ == "__main__":
    main()
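For quick sanity checking, here is a minimal sketch of how the pieces above compose, meant to be run in the same session once the models are loaded. The sentence pair and the expected output line are illustrative assumptions, not outputs taken from the training data:

# Hypothetical English-Hindi pair (assumption, not from the corpus)
src = "Narendra Modi lives in Delhi."
tgt = "नरेंद्र मोदी दिल्ली में रहते हैं।"
en_other, other_en = process_pair(src, tgt)
# Each matched entity pair becomes one tab-separated line, roughly:
#   en: Narendra Modi\thi: नरेंद्र मोदी\tPER
print("\n".join(en_other))

Whether a given pair survives depends on the NER model firing on both sides and on the containment-or-prefix test in partial_match after transliteration.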