In [1]:
import pandas as pd
import numpy as np
import csv
import sys

def process_chunk(chunk, source_lang, target_lang, lower=False, random_state=None):
    """Filter, normalize and shuffle one chunk of parallel sentence pairs.

    The steps run in this order, and the remaining row count is printed
    after each one:

    1. drop rows with a missing cell;
    2. drop exact duplicate rows;
    3. drop rows whose source text equals the target text;
    4. drop rows where one side has more than twice the words of the other,
       or where either side has more than 200 words (a "word" is counted as
       the number of spaces plus one);
    5. replace HTML-like markup with a space and collapse runs of spaces;
    6. optionally lower-case both columns;
    7. drop rows in which a cell became empty or whitespace-only;
    8. shuffle the rows.

    Args:
        chunk: DataFrame with string columns 'Source' and 'Target'.
        source_lang: Not used by this function; kept so existing callers
            keep working.
        target_lang: Not used by this function; kept so existing callers
            keep working.
        lower: When True, lower-case both columns.
        random_state: Seed forwarded to ``DataFrame.sample`` for the final
            shuffle. The default ``None`` keeps the shuffle unseeded.

    Returns:
        A new DataFrame with the surviving rows in shuffled order and a
        fresh 0..n-1 index. The input frame is not modified.
    """
    # Drop nan
    chunk = chunk.dropna()
    print("--- Rows with Empty Cells Deleted\t--> Rows:", chunk.shape[0])

    # Drop duplicates
    chunk = chunk.drop_duplicates()
    print("--- Duplicates Deleted\t\t\t--> Rows:", chunk.shape[0])

    # Drop copy-source rows. A boolean mask avoids writing a helper column
    # into a frame that may be a view of the caller's data.
    chunk = chunk[chunk['Source'] != chunk['Target']]
    print("--- Source-Copied Rows Deleted\t\t--> Rows:", chunk.shape[0])

    # Drop too-long rows (source or target); each word count is computed once.
    source_words = chunk['Source'].str.count(' ') + 1
    target_words = chunk['Target'].str.count(' ') + 1
    too_long = (
        (source_words > target_words * 2)
        | (target_words > source_words * 2)
        | (source_words > 200)
        | (target_words > 200)
    )
    chunk = chunk[~too_long]
    print("--- Too Long Source/Target Deleted\t--> Rows:", chunk.shape[0])

    # Remove HTML and normalize.
    # NOTE(review): the first two alternatives of this pattern are identical;
    # the second was possibly meant to match escaped tags - confirm upstream.
    chunk = chunk.replace(r'<.*?>|<.*?>|&?(amp|nbsp|quot);|{}', ' ', regex=True)
    # Collapse every run of spaces left behind by the removal above; the
    # previous single-space -> single-space replacement changed nothing.
    chunk = chunk.replace(r' {2,}', ' ', regex=True)
    print("--- HTML Removed\t\t\t--> Rows:", chunk.shape[0])

    # Lower-case the data if specified
    if lower:
        chunk = chunk.assign(
            Source=chunk['Source'].str.lower(),
            Target=chunk['Target'].str.lower(),
        )
        print("--- Rows are now lower-cased\t\t--> Rows:", chunk.shape[0])

    # Replace empty cells with NaN and delete them
    chunk = chunk.replace(r'^\s*$', np.nan, regex=True)
    chunk = chunk.dropna()
    print("--- Rows with Empty Cells Deleted\t--> Rows:", chunk.shape[0])

    # Shuffle the data
    chunk = chunk.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print("--- Rows Shuffled\t\t\t--> Rows:", chunk.shape[0])

    return chunk

def filter_and_save(source_file, target_file, source_lang, target_lang, chunk_size=100000, lower=False):
    """Filter a line-aligned parallel corpus chunk by chunk and save it.

    Both input files are read one line per row, ``chunk_size`` rows at a
    time. Each pair of chunks is joined side by side, passed through
    ``process_chunk`` and appended to the output files
    ``<source_file>-filtered.<source_lang>`` and
    ``<target_file>-filtered.<target_lang>``.

    Args:
        source_file: Path of the source-side text file.
        target_file: Path of the target-side text file.
        source_lang: Source language code; used as the output file suffix.
        target_lang: Target language code; used as the output file suffix.
        chunk_size: Number of rows read from each file per iteration.
        lower: Forwarded to ``process_chunk``.

    Warns:
        UserWarning: if the two inputs do not yield the same number of rows.
            The rows are paired by position, so a mismatch means the pairing
            may be wrong, possibly from the point where the files diverged.
    """
    # Local imports keep this block self-contained.
    import warnings
    from itertools import zip_longest

    # Output file names are derived from the input paths.
    source_file_out = source_file + '-filtered.' + source_lang
    target_file_out = target_file + '-filtered.' + target_lang

    # sep="\0" together with QUOTE_NONE makes pandas treat each whole line
    # as a single field.
    read_options = dict(
        sep="\0",
        quoting=csv.QUOTE_NONE,
        skip_blank_lines=False,
        on_bad_lines="skip",
        chunksize=chunk_size,
    )

    # The chunk readers are context managers, so the input files are closed
    # even when processing stops early.
    with open(source_file_out, 'w', encoding='utf-8', newline='') as source_out, \
         open(target_file_out, 'w', encoding='utf-8', newline='') as target_out, \
         pd.read_csv(source_file, names=['Source'], **read_options) as source_chunks, \
         pd.read_csv(target_file, names=['Target'], **read_options) as target_chunks:

        # zip_longest (instead of zip) lets us notice when one file runs out
        # of rows before the other.
        for chunk_source, chunk_target in zip_longest(source_chunks, target_chunks):
            if chunk_source is None or chunk_target is None:
                warnings.warn(
                    "Source and target files yielded a different number of chunks; "
                    "the remaining rows are ignored and earlier rows may be misaligned."
                )
                break

            if len(chunk_source) != len(chunk_target):
                # Bad lines are skipped per file, so the two sides can drift
                # apart; unmatched rows are dropped later as empty cells.
                warnings.warn(
                    "Source and target chunks differ in length (%d vs %d rows); "
                    "the files may be misaligned." % (len(chunk_source), len(chunk_target))
                )

            # Concatenate source and target data
            df = pd.concat([chunk_source, chunk_target], axis=1)
            print("Processing chunk... Rows:", df.shape[0])

            # Filter and process the chunk
            filtered_chunk = process_chunk(df, source_lang, target_lang, lower)

            # Write the filtered chunk to the output files, one sentence per line
            filtered_chunk['Source'].to_csv(source_out, header=False, index=False, quoting=csv.QUOTE_NONE, sep="\n")
            filtered_chunk['Target'].to_csv(target_out, header=False, index=False, quoting=csv.QUOTE_NONE, sep="\n")

    print("Processing complete. Filtered files saved as:", source_file_out, target_file_out)

if __name__ == '__main__':
    # Parallel corpus to clean: input path and language code for each side.
    source_file, source_lang = "train.eng_Latn", "en"
    target_file, target_lang = "train.guj_Gujr", "gu"

    # Filter both files in lock-step, ten million rows per chunk, keeping case.
    filter_and_save(
        source_file=source_file,
        target_file=target_file,
        source_lang=source_lang,
        target_lang=target_lang,
        chunk_size=10_000_000,
        lower=False,
    )


Processing chunk... Rows: 10000000
--- Rows with Empty Cells Deleted	--> Rows: 9999999
--- Duplicates Deleted			--> Rows: 9999306
--- Source-Copied Rows Deleted		--> Rows: 9999302
--- Too Long Source/Target Deleted	--> Rows: 9889531
--- HTML Removed			--> Rows: 9889531
--- Rows with Empty Cells Deleted	--> Rows: 9889521
--- Rows Shuffled			--> Rows: 9889521
Processing chunk... Rows: 10000000
--- Rows with Empty Cells Deleted	--> Rows: 9999997
--- Duplicates Deleted			--> Rows: 9999354
--- Source-Copied Rows Deleted		--> Rows: 9999348
--- Too Long Source/Target Deleted	--> Rows: 9893544
--- HTML Removed			--> Rows: 9893544
--- Rows with Empty Cells Deleted	--> Rows: 9893537
--- Rows Shuffled			--> Rows: 9893537
Processing chunk... Rows: 10000000
--- Rows with Empty Cells Deleted	--> Rows: 9999998
--- Duplicates Deleted			--> Rows: 9999360
--- Source-Copied Rows Deleted		--> Rows: 9999357
--- Too Long Source/Target Deleted	--> Rows: 9898546
--- HTML Removed			--> Rows: 9898546
--- Rows w

In [1]:
import os

In [2]:
# Show the current working directory before changing it.
pwd

'/home/vikrant-MNMT/myenv/BPCC/inline_tages'

In [3]:
# Make the eng_Latn-guj_Gujr folder the working directory for later cells.
# NOTE(review): hard-coded absolute path; it only resolves on the machine
# this notebook was written on.
os.chdir("/home/vikrant-MNMT/myenv/BPCC/inline_tages/eng_Latn-guj_Gujr")

In [1]:
# Show the current working directory.
# NOTE(review): the execution counter restarts at 1 here, so this cell ran
# in a different kernel session than the cells above it.
pwd

'/home/vikrant-MNMT/myenv/NMT_V2'

In [8]:
# Preview the first 20 lines of train_aggressively_shuffled.tgt.
!head -n 20 train_aggressively_shuffled.tgt

Did you want to find it?
I have read all of the letters sent to me.
यदि आपके पास कोई दिलचस्पी वाले उत्पाद हैं, जिन्हें उत्पादित करने की आवश्यकता है, तो कृपया मुझसे संपर्क करने के लिए स्वतंत्र महसूस करें, यह आपकी सेवा करने की मेरी खुशी है।
તેને ધ્યાનમાં રાખતા સ્વાસ્થ્ય અને પરિવાર કલ્યાણ મંત્રાલેય વિદેશથી પ્રવાસ કરીને ભારત આવનારા માટે નવા નિયમ બહાર પાડ્યા છે.
“A discussion and debate should be held on this.
At least seven to eight ministers could be inducted into Chief Minister Yogi Adityanath's cabinet.
95% of primary schools do not have computers
Sample rate:
ಈ ಸಮಯದಲ್ಲಿ ಶೌಚಾಲಯಗಳು ಮುಚ್ಚಲ್ಪಟ್ಟಿವೆ.
For reasons of transparency, we point out that we use Google Tag Manager.
18 Find The Best View From The Lion's Gate Bridge
If there's anything you need, tell me and I'll grab it.
यदि आप छोड़ने के तरीकों में रुचि रखते हैं, तो कोशिश करने के नए तरीके हैं, जैसे निकोटिन रिप्लेसमेंट थेरेपी।
Share His Music to the World
I take my children with me.
କିନ୍ତୁ ଆଜି ସକାଳେ ସେ ଆସିନଥିଲେ ।
Regardless of what kin

In [7]:
# Preview the first 100 lines of train_aggressively_shuffled.src, to compare
# line by line with the .tgt file.
!head -n 100 train_aggressively_shuffled.src

क्या आप इसे खोजना चाहते थे?
എനിക്ക് വേണ്ടി കിട്ടിയിരുന്ന എല്ലാ കത്തുകളും ഞാൻ വായിച്ചിട്ടുണ്ട്.
If you have any interested products that need to be produced, please feel free to contact me, it's my pleasure to serve you
In view of this, the Ministry of Health and Family Welfare has issued a new rule for those coming to India from abroad.
ਉਨ੍ਹਾਂ ਕਿਹਾ ਕਿ ਇਸ ਸਬੰਧੀ ਬਹਿਸ ਜ਼ਰੂਰ ਹੋਣੀ ਚਾਹੀਦੀ ਹੈ।
मुख्यमंत्री योगी आदित्यनाथ की कैबिनेट में कम से कम सात से आठ मंत्रियों को शामिल किया जा सकता है।
৯২ ভাগ স্কুলে নেই কম্পিউটার সুবিধা
নমুনা হাৰ:
Restrooms are closed at this time.
पारदर्शकतेच्या कारणास्तव कृपया लक्षात घ्या की आम्ही Google टॅग व्यवस्थापक वापरतो.
18 शेर के गेट ब्रिज से सबसे अच्छा दृश्य खोजें
आपको किसी चीज की जरूरत है तो मुझे बता दो, मैं ला दूंगा।
If you're interested in ways to quit, there are new methods to try, such as nicotine replacement therapy.
अपने संगीत को दुनिया के साथ साझा करें
ನನ್ನ ಮಕ್ಕಳನ್ನೂ ನನ್ನ ಜೊತೆ ಕರೆದುಕೊಂಡು ಹೋಗುತ್ತಿದ್ದೇನೆ.
He was not here this morning.
তা সে যে ধরনের সম্পর্ক

In [1]:
# Preview the first 10 lines of train.src.
!head -n 10 train.src 

en : রাজ্যের রাজধানী!
en : নিচে বিস্তারিত.
en : সেই একই কারণে!
en : আমার ছেলে একটি মেয়ের প্রেমে পড়েছে
en : bash এর একটি বিশেষ চরিত্র, এটি পূর্ববর্তী কমান্ড উল্লেখ করতে ব্যবহৃত হয়।
en : অর্থের মূল্য !
en : সন্ত্রাসিরা খুন করেছে।
en : গোটা শরীর জুড়ে!
en : আজ দেশের সব বিভাগে
en : স্ত্রী আর তিন সন্তান!


In [2]:
# Fetch the MT-Preparation repository; its subwording script is run in a later cell.
# NOTE(review): no commit or tag is pinned, so a re-run may pull different code.
!git clone https://github.com/ymoslem/MT-Preparation.git

Cloning into 'MT-Preparation'...
remote: Enumerating objects: 305, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 305 (delta 66), reused 114 (delta 58), pack-reused 174 (from 1)[K
Receiving objects: 100% (305/305), 84.51 KiB | 1.96 MiB/s, done.
Resolving deltas: 100% (149/149), done.


In [3]:
# Download target.model from the NMT_multilingual_12 Hugging Face repository.
!wget https://huggingface.co/Vikrantyadav11234/NMT_multilingual_12/resolve/main/target.model

--2024-12-16 15:13:01-- https://huggingface.co/Vikrantyadav11234/NMT_multilingual_12/resolve/main/target.model
Resolving huggingface.co (huggingface.co)... 18.172.134.4, 18.172.134.124, 18.172.134.88, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/39/ee/39ee14e585e8e2e88fd5b98517c223831219784b29620228589df42bb9e6c6e6/6b0a6b669581c874390272a0aa8c77e5e5394b975b5f7bd42928fe7d8bb046b8?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27target.model%3B+filename%3D%22target.model%22%3B&Expires=1734621181&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNDYyMTE4MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzM5L2VlLzM5ZWUxNGU1ODVlOGUyZTg4ZmQ1Yjk4NTE3YzIyMzgzMTIxOTc4NGIyOTYyMDIyODU4OWRmNDJiYjllNmM2ZTYvNmIwYTZiNjY5NTgxYzg3NDM5MDI3MmEwYWE4Yzc3ZTVlNTM5NGI5NzViNWY3YmQ0MjkyOGZlN2Q4YmIwNDZiOD9yZXNwb25zZS1

In [4]:
# Download source.model from the NMT_multilingual_12 Hugging Face repository.
!wget https://huggingface.co/Vikrantyadav11234/NMT_multilingual_12/resolve/main/source.model

--2024-12-16 15:13:32-- https://huggingface.co/Vikrantyadav11234/NMT_multilingual_12/resolve/main/source.model
Resolving huggingface.co (huggingface.co)... 18.160.143.75, 18.160.143.76, 18.160.143.32, ...
Connecting to huggingface.co (huggingface.co)|18.160.143.75|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/39/ee/39ee14e585e8e2e88fd5b98517c223831219784b29620228589df42bb9e6c6e6/076e8b22e4cf7baff9c9d0897a19da65b77e53064e84e631d883cebfa3b20466?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27source.model%3B+filename%3D%22source.model%22%3B&Expires=1734621212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNDYyMTIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzM5L2VlLzM5ZWUxNGU1ODVlOGUyZTg4ZmQ1Yjk4NTE3YzIyMzgzMTIxOTc4NGIyOTYyMDIyODU4OWRmNDJiYjllNmM2ZTYvMDc2ZThiMjJlNGNmN2JhZmY5YzlkMDg5N2ExOWRhNjViNzdlNTMwNjRlODRlNjMxZDg4M2NlYmZhM2IyMDQ2Nj9yZXNwb25zZS

In [6]:
# Subword train.src and train.tgt with the downloaded source/target models.
# Per the script's log, the results are written to train.src.subword and
# train.tgt.subword.
!python3 MT-Preparation/subwording/2-subword.py source.model target.model train.src train.tgt

Source Model: source.model
Target Model: target.model
Source Dataset: train.src
Target Dataset: train.tgt


Done subwording the source file! Output: train.src.subword
Done subwording the target file! Output: train.tgt.subword


In [None]:
# TODO: incomplete command. No repository id or local path is given, and the
# cell was never executed (In [None]). Fill in the arguments before running.
!huggingface-cli upload 
