# TTS IITM SPEECH LAB
"""Example client: POST a sentence to the TTS server and save the returned audio."""
import base64
import json

import requests

# Sample sentences, one per supported language; uncomment exactly one.
text = "सुप्रभात, आप कैसे हैं?" # hindi
# text = "സുപ്രഭാതം, സുഖമാ?" # malayalam
# text = "সুপ্ৰভাত, তুমি কেনে?" # manipuri
# text = "सुप्रभात, तुम्ही कसे आहात?" # marathi
# text = "ಶುಭೋದಯ, ನೀವು ಹೇಗಿದ್ದೀರಿ?" # kannada
# text = "बसु म्विथ्बो, बरि दिबाबो?" # bodo male not working <---
# text = "Good morning, how are you?" # english
# text = "সুপ্ৰভাত, আপুনি কেমন আছে?" # assamese
# text = "காலை வணக்கம், நீங்கள் எப்படி இருக்கின்றீர்கள்?" # tamil
# text = "ସୁପ୍ରଭାତ, ଆପଣ କେମିତି ଅଛନ୍ତି?" # odia male not working <---
# text = "सुप्रभात, आप कैसे छो?" # rajasthani
# text = "శుభోదయం, మీరు ఎలా ఉన్నారు?" # telugu
# text = "সুপ্রভাত, আপনি কেমন আছেন?" # bengali male not working <---
# text = "સુપ્રભાત, તમે કેમ છો?" # gujarati

lang = 'hindi'
gender = 'female'

url = "http://localhost:4005/tts"
# url = 'http://projects.respark.iitm.ac.in:8009/tts' # proxy

payload = json.dumps({
    "input": text,
    "gender": gender,
    "lang": lang,
    "alpha": 1,
    "segmentwise": "True"
})
headers = {'Content-Type': 'application/json'}

http_response = requests.post(url, headers=headers, data=payload)
# Fail loudly on HTTP-level errors instead of trying to parse an error page.
http_response.raise_for_status()
response = http_response.json()

# On failure the server replies with a "reason" and no "audio" key; report it
# instead of crashing with a bare KeyError.
if 'audio' not in response:
    raise SystemExit(f"TTS request failed: {response.get('reason', response)}")

audio = response['audio']
# NOTE(review): the local server encodes a WAV stream, so the ".mp3" extension
# looks misleading; the name is kept so existing users of this script are not
# surprised -- confirm before renaming.
file_name = "tts.mp3"
decode_string = base64.b64decode(audio)
# `with` guarantees the file is closed even if the write fails.
with open(file_name, 'wb') as wav_file:
    wav_file.write(decode_string)

'''
Supported languages

Assamese
Bengali
Bodo
English
Gujarati
Hindi
Kannada
Malayalam
Manipuri
Marathi
Odia
Punjabi
Rajasthani
Tamil
Telugu
Urdu
'''
def load_hifigan_vocoder(language, gender, device):
    """Load the HiFi-GAN generator for ``gender`` and prepare it for inference.

    Args:
        language: Currently unused; the checkpoint path depends only on
            ``gender``.
        gender: Sub-directory of ``vocoder/`` to load from (e.g. ``"male"``).
        device: Anything accepted by ``torch.device`` (e.g. ``"cpu"``).

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    # NOTE(review): the repository also tracks vocoder/<gender>/dravidian;
    # confirm that always using the "aryan" vocoder is intended.
    vocoder_config = f"vocoder/{gender}/aryan/hifigan/config.json"
    vocoder_generator = f"vocoder/{gender}/aryan/hifigan/generator"

    with open(vocoder_config, 'r') as f:
        json_config = json.load(f)
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)

    # Build the generator on the requested device and restore its weights.
    device = torch.device(device)
    generator = Generator(h).to(device)
    state_dict_g = torch.load(vocoder_generator, device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator


def load_fastspeech2_model(language, gender, device):
    """Build the FastSpeech2 ``Text2Speech`` object for a language/gender pair.

    The model's ``config.yaml`` must point at absolute paths of the statistics
    files under the current working directory. The file is rewritten only when
    one of those paths is out of date, so repeated calls (one per request) no
    longer rewrite it each time.

    Args:
        language: Top-level model directory name (e.g. ``"hindi"``).
        gender: Sub-directory under the language (e.g. ``"female"``).
        device: Device passed through to ``Text2Speech``.

    Returns:
        A ``Text2Speech`` instance for the selected model.
    """
    tts_model = f"{language}/{gender}/model/model.pth"
    tts_config = f"{language}/{gender}/model/config.yaml"

    with open(tts_config, "r") as file:
        config = yaml.safe_load(file)

    base_dir = os.path.join(os.getcwd(), language, gender)
    stats_files = {
        "normalize_conf": os.path.join(base_dir, "model/feats_stats.npz"),
        "pitch_normalize_conf": os.path.join(base_dir, "model/pitch_stats.npz"),
        "energy_normalize_conf": os.path.join(base_dir, "model/energy_stats.npz"),
    }

    config_changed = False
    for section, stats_path in stats_files.items():
        if config[section].get("stats_file") != stats_path:
            config[section]["stats_file"] = stats_path
            config_changed = True

    # Only touch the file on disk when a path actually changed.
    if config_changed:
        with open(tts_config, "w") as file:
            yaml.dump(config, file)

    return Text2Speech(train_config=tts_config, model_file=tts_model, device=device)


def text_synthesis(language, gender, sample_text, vocoder, MAX_WAV_VALUE, device, alpha=1):
    """Synthesise ``sample_text`` and return the waveform as an int16 array.

    Args:
        language: Language of the FastSpeech2 model to load.
        gender: Voice gender of the FastSpeech2 model to load.
        sample_text: Preprocessed text fed to the acoustic model.
        vocoder: HiFi-GAN generator returned by ``load_hifigan_vocoder``.
        MAX_WAV_VALUE: Scale applied to the vocoder output before the int16 cast.
        device: Device on which inference runs.
        alpha: Value forwarded as ``decode_conf["alpha"]``.

    Returns:
        The synthesised audio samples as an int16 NumPy array.
    """
    with torch.no_grad():
        # The acoustic model is loaded on every call.
        model = load_fastspeech2_model(language, gender, device)

        # Text -> mel-spectrogram.
        out = model(sample_text, decode_conf={"alpha": alpha})
        print("TTS Done")
        # NOTE(review): 2.3262 is an unexplained scaling constant; its origin
        # is not visible in this file.
        x = out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262
        x = x.to(device)

        # Mel-spectrogram -> waveform.
        y_g_hat = vocoder(x)
        audio = y_g_hat.squeeze()
        audio = audio * MAX_WAV_VALUE
        audio = audio.cpu().numpy().astype('int16')

    return audio


def setup_app():
    """Load one vocoder per gender/language and run a warm-up synthesis for each.

    Returns:
        Nested dict ``vocoders[gender][language]`` of loaded generators.
    """
    genders = ['male','female']
    # to make dummy calls in all languages available
    languages = {'hindi': "नमस्ते",'malayalam': "ഹലോ",'manipuri': "হ্যালো",'marathi': "हॅलो",'kannada': "ಹಲೋ",'bodo': "हॅलो",'english': "Hello",'assamese': "হ্যালো",'tamil': "ஹலோ",'odia': "ହେଲୋ",'rajasthani': "हॅलो",'telugu': "హలో",'bengali': "হ্যালো",'gujarati': "હલો"}

    vocoders = {}
    for gender in genders:
        vocoders[gender] = {}
        for language, text in languages.items():
            # Load the HiFi-GAN vocoder with dynamic language and gender
            vocoder = load_hifigan_vocoder(language, gender, device)
            vocoders[gender][language] = vocoder
            # dummy calls
            print(f"making dummy calls for {language} - {gender}")
            try:
                text_synthesis(language, gender, text, vocoder, MAX_WAV_VALUE, device)
            except Exception:
                # Best effort: a missing model must not stop start-up, but
                # Ctrl-C / SystemExit must still propagate (hence no bare except).
                message = f"cannot make dummy call for {gender} - {language} <==================="
                print(message.upper())
                logging.exception(message)

    print("Server Started...")
    return vocoders
vocoders = setup_app()


@app.route('/', methods=['GET'])
def main():
    """Return a fixed identification string for the service root."""
    return "IITM_TTS_V2"


@app.route('/tts', methods=['GET', 'POST'], strict_slashes=False)
def tts():
    """Synthesise speech for the JSON request and return base64-encoded WAV.

    Expected JSON keys: ``input`` (str), ``gender``, ``lang`` and optionally
    ``alpha`` (defaults to 1). The reply is JSON with ``status`` and either
    ``audio`` (base64 WAV) or ``reason``.
    """
    try:
        json_data = request.get_json()
        text = json_data["input"]
        if not isinstance(text, str):
            input_type = type(text)
            # Return immediately; previously this response was built and then
            # overwritten by the code below.
            return jsonify(status='failure', reason=f"Unsupported input type {input_type}. Input text should be in string format.")
        gender = json_data["gender"]
        language = json_data["lang"].lower()
        alpha = json_data.get("alpha", 1)

        # Reject unknown voices up front with a readable reason instead of an
        # opaque KeyError message.
        if gender not in vocoders or language not in vocoders[gender]:
            return jsonify(status="failure", reason=f"Unsupported gender/language combination: {gender}/{language}")

        # Preprocess the sample text
        preprocessed_text, phrases = preprocessor.preprocess(text, language, gender)
        preprocessed_text = " ".join(preprocessed_text)
        vocoder = vocoders[gender][language]
        out = text_synthesis(language, gender, preprocessed_text, vocoder, MAX_WAV_VALUE, device, alpha=alpha)

        # Encode the WAV in memory to avoid saving a file on disk.
        output_stream = io.BytesIO()
        write(output_stream, SAMPLING_RATE, out)
        audio_wav_bytes = base64.b64encode(output_stream.getvalue())

        ret = jsonify(status="success", audio=audio_wav_bytes.decode('utf-8'))

    except Exception as err:
        logging.exception("TTS request failed")
        ret = jsonify(status="failure", reason=str(err))
    return ret


if __name__ == '__main__':
    # ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
    # ssl_context.load_cert_chain('./ssl2023/iitm2022.crt','./ssl2023/iitm2022.key')
    # Debug mode exposes the interactive debugger to every client when bound
    # to 0.0.0.0, so it is now opt-in via FLASK_DEBUG=1 instead of always on.
    app.run(host='0.0.0.0', port=4005, debug=os.environ.get("FLASK_DEBUG", "0") == "1")
class TextReplacer:
    """Map multi-character phone labels to single-character symbols."""

    def __init__(self):
        # Applied sequentially in insertion order by `apply_replacements`, so
        # a longer label must come before any shorter label sharing its prefix
        # (e.g. 'dxhq' before 'dxh' before 'dx').
        # NOTE(review): 'hq' precedes 'khq', so in `apply_replacements` the
        # text "khq" becomes "kH" and the 'khq' entry never matches. Left
        # unchanged because trained models may depend on the current output --
        # confirm before reordering.
        self.replacements = {
            'aa':'A',
            'ae':'ऍ',
            'ag':'ऽ',
            'ai':'ऐ',
            'au':'औ',
            'axx':'अ',
            'ax':'ऑ',
            'bh':'B',
            'ch':'C',
            'dh':'ध',
            'dxhq':'T',
            'dxh':'ढ',
            'dxq':'D',
            'dx':'ड',
            'ee':'E',
            'ei':'ऐ',
            'eu':'உ',
            'gh':'घ',
            'gq':'G',
            'hq':'H',
            'ii':'I',
            'jh':'J',
            'khq':'K',
            'kh':'ख',
            'kq':'क',
            'ln':'ൾ',
            'lw':'ൽ',
            'lx':'ള',
            'mq':'M',
            'nd':'ऩ',
            'ng':'ङ',
            'nj':'ञ',
            'nk':'Y',
            'nn':'N',
            'nw':'ൺ',
            'nx':'ण',
            'oo':'O',
            'ou':'औ',
            'ph':'P',
            'rqw':'ॠ',
            'rq':'R',
            'rw':'ർ',
            'rx':'ऱ',
            'sh':'श',
            'sx':'ष',
            'txh':'ठ',
            'th':'थ',
            'tx':'ट',
            'uu':'U',
            'wv':'W',
            'zh':'Z'

            # ... Add more replacements as needed
        }

    def apply_replacements(self, text):
        """Replace every known label in ``text`` and drop all space characters.

        Args:
            text: String containing phone labels.

        Returns:
            ``text`` with each label substituted (in table order) and every
            ``" "`` removed; other whitespace is left untouched.
        """
        for key, value in self.replacements.items():
            text = text.replace(key, value)
        # One pass instead of rebuilding the string one character at a time.
        return text.replace(" ", "")

    def apply_replacements_by_phonems(self, text):
        """Return the symbol for exactly one phone label.

        Args:
            text: A single label that must be a key of the table.

        Returns:
            The mapped symbol.

        Raises:
            KeyError: If ``text`` is not a known label.
        """
        return self.replacements[text]
def load_hifigan_vocoder(language, gender, device):
    """Load the HiFi-GAN generator for ``gender`` and prepare it for inference.

    Args:
        language: Currently unused; the checkpoint path depends only on
            ``gender``.
        gender: Sub-directory of ``vocoder/`` to load from (e.g. ``"male"``).
        device: Anything accepted by ``torch.device`` (e.g. ``"cpu"``).

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    vocoder_config = f"vocoder/{gender}/aryan/hifigan/config.json"
    vocoder_generator = f"vocoder/{gender}/aryan/hifigan/generator"

    with open(vocoder_config, 'r') as f:
        json_config = json.load(f)
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)

    # Build the generator on the requested device and restore its weights.
    device = torch.device(device)
    generator = Generator(h).to(device)
    state_dict_g = torch.load(vocoder_generator, device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator


def load_fastspeech2_model(language, gender, device):
    """Build the FastSpeech2 ``Text2Speech`` object for a language/gender pair.

    The model's ``config.yaml`` must point at absolute paths of the statistics
    files under the current working directory. The file is rewritten only when
    one of those paths is out of date.

    Args:
        language: Top-level model directory name (e.g. ``"hindi"``).
        gender: Sub-directory under the language (e.g. ``"female"``).
        device: Device passed through to ``Text2Speech``.

    Returns:
        A ``Text2Speech`` instance for the selected model.
    """
    tts_model = f"{language}/{gender}/model/model.pth"
    tts_config = f"{language}/{gender}/model/config.yaml"

    with open(tts_config, "r") as file:
        config = yaml.safe_load(file)

    base_dir = os.path.join(os.getcwd(), language, gender)
    stats_files = {
        "normalize_conf": os.path.join(base_dir, "model/feats_stats.npz"),
        "pitch_normalize_conf": os.path.join(base_dir, "model/pitch_stats.npz"),
        "energy_normalize_conf": os.path.join(base_dir, "model/energy_stats.npz"),
    }

    config_changed = False
    for section, stats_path in stats_files.items():
        if config[section].get("stats_file") != stats_path:
            config[section]["stats_file"] = stats_path
            config_changed = True

    # Only touch the file on disk when a path actually changed.
    if config_changed:
        with open(tts_config, "w") as file:
            yaml.dump(config, file)

    return Text2Speech(train_config=tts_config, model_file=tts_model, device=device)


def text_synthesis(language, gender, sample_text, vocoder, MAX_WAV_VALUE, device, alpha):
    """Synthesise ``sample_text`` and return the waveform as an int16 array.

    Args:
        language: Language of the FastSpeech2 model to load.
        gender: Voice gender of the FastSpeech2 model to load.
        sample_text: Preprocessed text fed to the acoustic model.
        vocoder: HiFi-GAN generator returned by ``load_hifigan_vocoder``.
        MAX_WAV_VALUE: Scale applied to the vocoder output before the int16 cast.
        device: Device on which inference runs.
        alpha: Value forwarded as ``decode_conf["alpha"]``.

    Returns:
        The synthesised audio samples as an int16 NumPy array.
    """
    with torch.no_grad():
        model = load_fastspeech2_model(language, gender, device)

        print('Alpha ', alpha)

        # Text -> mel-spectrogram.
        out = model(sample_text, decode_conf={"alpha": alpha})
        print("TTS Done")
        # NOTE(review): 2.3262 is an unexplained scaling constant; its origin
        # is not visible in this file.
        x = out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262
        x = x.to(device)

        # Mel-spectrogram -> waveform.
        y_g_hat = vocoder(x)
        audio = y_g_hat.squeeze()
        audio = audio * MAX_WAV_VALUE
        audio = audio.cpu().numpy().astype('int16')

    return audio


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text-to-Speech Inference")
    parser.add_argument("--language", type=str, required=True, help="Language (e.g., hindi)")
    parser.add_argument("--gender", type=str, required=True, help="Gender (e.g., female)")
    parser.add_argument("--sample_text", type=str, required=True, help="Text to be synthesized")
    parser.add_argument("--output_file", type=str, default="", help="Output WAV file path")
    parser.add_argument("--alpha", type=float, default=1, help="Alpha Parameter")

    args = parser.parse_args()

    phone_dictionary = {}
    # Set the device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the HiFi-GAN vocoder with dynamic language and gender
    vocoder = load_hifigan_vocoder(args.language, args.gender, device)

    # Pick the text front-end that matches the language.
    if args.language in ("urdu", "punjabi"):
        preprocessor = CharTextPreprocessor()
    elif args.language == "english":
        preprocessor = TTSPreprocessor()
    else:
        preprocessor = TTSDurAlignPreprocessor()

    # Preprocess the sample text
    preprocessed_text, phrases = preprocessor.preprocess(args.sample_text, args.language, args.gender, phone_dictionary)
    preprocessed_text = " ".join(preprocessed_text)

    audio = text_synthesis(args.language, args.gender, preprocessed_text, vocoder, MAX_WAV_VALUE, device, args.alpha)

    # Fall back to a name derived from language and gender when no path is given.
    output_file = args.output_file or f"{args.language}_{args.gender}_output.wav"

    write(output_file, SAMPLING_RATE, audio)
import re
import sys


def replace_in_file(file_path, replacements):
    """Rewrite ``file_path`` in place, applying each search/replace pair.

    Args:
        file_path: Path of a UTF-8 text file to modify.
        replacements: Mapping of search pattern to replacement text. Patterns
            are passed to ``re.sub`` and are therefore treated as regular
            expressions; pairs are applied in mapping order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    for search, replace in replacements.items():
        content = re.sub(search, replace, content)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)


def main():
    """Map quoted phone labels to their symbols in the file named on the command line."""
    # Give a usage message instead of a raw IndexError when the path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <file>")
    in_file = sys.argv[1]

    # Keys include the surrounding double quotes, so only whole quoted labels
    # are replaced.
    replacements = {
        '"aa"': '"A"',
        '"ii"': '"I"',
        '"uu"': '"U"',
        '"ee"': '"E"',
        '"oo"': '"O"',
        '"nn"': '"N"',
        '"ae"': '"ऍ"',
        '"ag"': '"ऽ"',
        '"au"': '"औ"',
        '"ax"': '"ऑ"',
        '"bh"': '"B"',
        '"ch"': '"C"',
        '"dh"': '"ध"',
        '"dx"': '"ड"',
        '"dxh"': '"ढ"',
        '"dxhq"': '"ढ़"',
        '"dxq"': '"ड़"',
        '"ei"': '"ऐ"',
        '"ai"': '"ऐ"',
        '"eu"': '"उ"',
        '"gh"': '"घ"',
        '"gq"': '"ग़"',
        '"hq"': '"H"',
        '"jh"': '"J"',
        '"kh"': '"ख"',
        '"khq"': '"ख़"',
        '"kq"': '"क़"',
        '"ln"': '"ൾ"',
        '"lw"': '"ൽ"',
        '"lx"': '"ള"',
        '"mq"': '"M"',
        '"nd"': '"ऩ"',
        '"ng"': '"ङ"',
        '"nj"': '"ञ"',
        '"nk"': '"़"',
        '"nw"': '"ൺ"',
        '"nx"': '"ण"',
        '"ou"': '"औ"',
        '"ph"': '"P"',
        '"rq"': '"R"',
        '"rqw"': '"ॠ"',
        '"rw"': '"ർ"',
        '"rx"': '"ऱ"',
        '"sh"': '"श"',
        '"sx"': '"ष"',
        '"th"': '"थ"',
        '"tx"': '"ट"',
        '"txh"': '"ठ"',
        '"wv"': '"W"',
        '"zh"': '"Z"',
    }

    replace_in_file(in_file, replacements)


if __name__ == "__main__":
    main()
new file mode 100644 index 0000000000000000000000000000000000000000..4589a582d2f1dd16aced83667479db481e6fea58 --- /dev/null +++ b/ssn_parser_new/lists/cons @@ -0,0 +1,130 @@ +க +ங +ச +ஜ +ஞ +ட +த +ந +ண +ன +ப +ம +ய +ர +ற +ல +ள +ழ +வ +ஷ +ஸ +ஹ +ஃ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +ळ +व +ष +श +स +ह +क्ष +ക +ഖ +ഗ +ഘ +ങ +ച +ഛ +ജ +ഝ +ഞ +ട +ഠ +ഡ +ഢ +ണ +ത +ഥ +ദ +ധ +ന +പ +ഫ +ബ +ഭ +മ +യ +ര +റ +ല +ള +ഴ +വ +ശ +ഷ +സ +ഹ +బ +భ +చ +ఛ +డ +ఢ +ద +ధ +ఫ +గ +ఘ +హ +జ +ఝ +క +ఖ +ల +ళ +మ +న +ణ +ప +ఞ +ఙ +ర +ఱ +ఋ +స +ష +శ +ట +ఠ +త +థ +వ +య diff --git a/ssn_parser_new/lists/dv b/ssn_parser_new/lists/dv new file mode 100644 index 0000000000000000000000000000000000000000..72c278a235120c5ef3e6baac486883bb8fc7f381 --- /dev/null +++ b/ssn_parser_new/lists/dv @@ -0,0 +1,56 @@ +் +ா +ி +ீ +ு +ூ +ெ +ே +ை +ொ +ோ +ௌ +ा +ि +ी +ु +ू +ृ +े +ै +ो +ौ +ं +ः +ँ +ം +ാ +ി +ീ +ു +ൂ +ൃ +െ +േ +ൈ +ൊ +ോ +ൌ +് +ഃ +ా +ి +ీ +ు +ూ +ృ +ె +ే +ై +ొ +ో +ౌ +ౖ +ఁ +ం +ః diff --git a/ssn_parser_new/lists/end_syl_list b/ssn_parser_new/lists/end_syl_list new file mode 100644 index 0000000000000000000000000000000000000000..07db8f82797d89c2fd8da2ebff1eda0f002c5b8d --- /dev/null +++ b/ssn_parser_new/lists/end_syl_list @@ -0,0 +1,404 @@ +zhoon +zhoo +yoong +yitx +yis +yins +yew +yek +yarsk +yalx +yaasx +yaalx +wur +wort +wizh +wisx +wis +wingg +windd +wiil +wiik +werp +weey +wees +weertx +was +wanx +wakt +wain +waih +waaw +waam +uum +txoosx +txoostx +txoomsk +txoom +txitx +txingg +txiir +txiim +txiil +txes +txel +txek +txeetxs +txeestx +txees +txeenxdx +txeel +txastx +txanxdx +txail +txaatxs +txaarc +txaak +tuu +trees +tooy +toon +too +tong +teew +teelx +taalx +sxoor +sxoonm +sxit +sxil +sxifr +sxar +sxair +sxaandd +suum +sung +stxoor +stxiitx +spings +spik +spek +soow +so +sketxc +skaarf +sink +singg +sim +sils +siil +sii +seyng +seyn +sep +sentx +sem +sees +sau +satxs +sartx +sans +rxulx +rxoytx +rxoo +rxiing +rxen +rxeel +rxars +rxaang +ruups +roow +roop +ritx +risks +riitxs +riir 
+riil +rektx +ratxs +rastx +raksx +raitxs +raas +raaptxs +puun +puum +praangg +poop +poons +pooltx +plxas +piy +pilxs +piir +piins +pes +pel +pek +peetxc +peesx +pars +pair +paayntx +paasx +paask +paartx +paalx +paah +oos +nxuu +nxung +nxoo +nxiing +nxantx +nxaar +nxaam +nook +njaar +ningg +niip +nii +ngin +nga +ng +neetx +neel +neej +ndooys +ndir +ndil +ndiips +ndafs +ndaastx +nd +natx +nastxm +naltx +nals +naitx +naays +naayk +naay +naangg +muuw +muurs +mooltx +mis +mirm +miiys +miit +miis +meesxr +maut +martx +marnd +mams +mac +maatx +maanxdx +lxur +lxing +lxiim +lxeen +lxeek +lxark +lxair +lxaaw +lxaark +lulx +loow +loom +lisx +lips +linj +lingg +liil +liik +leyng +letx +ler +leej +lars +lanxdx +laanxdx +laaks +laah +kwaang +kuurt +kris +kriis +kriim +kreem +kraim +kool +kir +kiptx +kings +kiizht +kiim +keym +kens +keetx +kees +keep +keems +keelx +kays +kanxdx +kails +kaantx +kaangg +kaah +juur +joons +jol +jiir +jen +jatx +jas +jars +jain +jaaw +jaas +i +hul +hraam +hoo +hon +hhan +heu +hee +he +har +haj +haars +haar +haap +gur +gulx +goos +goor +gis +gins +giizh +gels +geet +gaw +ganxdx +gals +gaastx +gaandd +gaam +gaaks +frsi +foors +faitx +fai +ert +elx +ef +ec +dxur +dxunx +dxos +dxoow +dxoor +dxoon +dxoom +dxingg +dxeesx +dxeej +dxasxk +dxas +dxaitxs +dxaas +dxaartx +dxaak +dxaaf +duur +duun +dun +dem +deesx +dees +deelx +day +darn +dams +dalx +daart +cuu +col +cin +cii +ceew +caaw +caatx +caar +bur +bunx +book +bisx +bins +bilxs +biing +biin +bert +benx +beetx +beesx +band +bals +baawtx +baas +baalx +baaltx +asxk +ars +ank +aas +aang +aam +aaktx +lxaa +dxeen +ma +jaa +sey +rxoom +lxulx +rxaay +daal +car +sis +diir +aa +txaal +ra +maam +woom +lxoo +see +wuut +rxaal +poom +paanxdx +neen +nas +lxa +las +him +hi +doom +cee +buu +boo +nxee +txeen +poo +noo +haa +deen +daay +puu +kaan diff --git a/ssn_parser_new/lists/english b/ssn_parser_new/lists/english new file mode 100644 index 
0000000000000000000000000000000000000000..34f2e5371f0ecb558038a73f051e5d3ed4c75393 --- /dev/null +++ b/ssn_parser_new/lists/english @@ -0,0 +1,53 @@ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + diff --git a/ssn_parser_new/lists/english_tam_map b/ssn_parser_new/lists/english_tam_map new file mode 100644 index 0000000000000000000000000000000000000000..6294a568881f19fd4bca5c923b7eaa14c1544eb9 --- /dev/null +++ b/ssn_parser_new/lists/english_tam_map @@ -0,0 +1,409 @@ +#map file containing tamil caracters [ ] english caracters + + +! ! +@ @ +" " +# # +% % +' ' +( ( +) ) +* * ++ + +, , +_ _ +- - +. . +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +; ; +அ a +ஆ aa +இ i +ஈ ii +உ u +ஊ uu +எ e +ஏ ee +ஐ ai +ஒ o +ஓ oo +ஔ au +க ka +கா kaa +கி ki +கீ kii +கு ku +கூ kuu +கெ ke +கே kee +கை kai +கொ ko +கோ koo +கௌ kau +க் k +ங nga +ஙா ngaa +ஙி ngi +ஙீ ngii +ஙு ngu +ஙூ nguu +ஙெ nge +ஙே ngee +ஙை ngai +ஙொ ngo +ஙோ ngoo +ஙௌ ngau +ங் ng +ச ca +சா caa +சி ci +சீ cii +சு cu +சூ cuu +செ ce +சே cee +சை cai +சொ co +சோ coo +சௌ cau +ச் c +ஜ ja +ஜா jaa +ஜி ji +ஜீ jii +ஜு ju +ஜூ juu +ஜெ je +ஜே jee +ஜை jai +ஜொ jo +ஜோ joo +ஜௌ jau +ஜ் j +ஞ nja +ஞா njaa +ஞி nji +ஞீ njii +ஞு nju +ஞூ njuu +ஞெ nje +ஞே njee +ஞை njai +ஞொ njo +ஞோ njoo +ஞௌ njau +ஞ் nj +ட txa +டா txaa +டி txi +டீ txii +டு txu +டூ txuu +டெ txe +டே txee +டை txai +டொ txo +டோ txoo +டௌ txau +ட் tx +த ta +தா taa +தி ti +தீ tii +து tu +தூ tuu +தெ te +தே tee +தை tai +தொ to +தோ too +தௌ tau +த் t +ந nda +நா ndaa +நி ndi +நீ ndii +நு ndu +நூ nduu +நெ nde +நே ndee +நை ndai +நொ ndo +நோ ndoo +நௌ ndau +ந் nd +ண nxa +ணா nxaa +ணி nxi +ணீ nxii +ணு nxu +ணூ nxuu +ணெ nxe +ணே nxee +ணை nxai +ணொ nxo +ணோ nxoo +ணௌ nxau +ண் nx +ன na +னா naa +னி ni +னீ nii +னு nu +னூ nuu +னெ ne +னே nee +னை nai +னொ no +னோ noo +னௌ nau +ன் n +ப pa +பா paa +பி pi +பீ pii +பு pu +பூ puu +பெ pe +பே pee +பை pai +பொ po +போ poo +பௌ pau +ப் p +ம ma +மா maa +மி mi +மீ mii +மு mu 
+மூ muu +மெ me +மே mee +மை mai +மொ mo +மோ moo +மௌ mau +ம் m +ய ya +யா yaa +யி yi +யீ yii +யு yu +யூ yuu +யெ ye +யே yee +யை yai +யொ yo +யோ yoo +யௌ yau +ய் y +ர ra +ரா raa +ரி ri +ரீ rii +ரு ru +ரூ ruu +ரெ re +ரே ree +ரை rai +ரொ ro +ரோ roo +ரௌ rau +ர் r +ற rxa +றா rxaa +றி rxi +றீ rxii +று rxu +றூ rxuu +றெ rxe +றே rxee +றை rxai +றொ rxo +றோ rxoo +றௌ rxau +ற் rx +ல la +லா laa +லி li +ப pa +பா paa +பி pi +பீ pii +பு pu +பூ puu +பெ pe +பே pee +பை pai +பொ po +போ poo +பௌ pau +ப் p +லீ lii +லு lu +லூ luu +லெ le +லே lee +லை lai +லொ lo +லோ loo +லௌ lau +ல் l +ள lxa +ளா lxaa +ளி lxi +ளீ lxii +ளு lxu +ளூ lxuu +ளெ lxe +ளே lxee +ளை lxai +ளொ lxo +ளோ lxoo +ளௌ lxau +ள் lx +ழ zha +ழா zhaa +ழி zhi +ழீ zhii +ழு zhu +ழூ zhuu +ழெ zhe +ழே zhee +ழை zhai +ழொ zho +ழோ zhoo +ழௌ zhau +ழ் zh +வ wa +வா waa +வி wi +வீ wii +வு wu +வூ wuu +வெ we +வே wee +வை wai +வொ wo +வோ woo +வௌ wau +வ் w +ஷ sxa +ஷா sxaa +ஷி sxi +ஷீ sxii +ஷு sxu +ஷூ sxuu +ஷெ sxe +ஷே sxee +ஷை sxai +ஷொ sxao +ஷோ sxaoo +ஷௌ sxau +ஷ் sx +ஸ sa +ஸா saa +ஸி si +ஸீ sii +ஸு su +ஸூ suu +ஸெ se +ஸே see +ஸை sai +ஸொ so +ஸோ soo +ஸௌ sau +ஸ் s +ஹ ha +ஹா haa +ஹி hi +ஹீ hii +ஹு hu +ஹூ huu +ஹெ he +ஹே hee +ஹை hai +ஹொ ho +ஹோ hoo +ஹௌ hau +ஹ் h +ஃப fa +ஃபா faa +ஃபி fi +ஃபீ fii +ஃபு fu +ஃபூ fuu +ஃபெ fe +ஃபே fee +ஃபை fai +ஃபொ fo +ஃபோ foo +ஃபௌ fau +ஃப் f +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +अ a +आ aa +मं m +मा maa diff --git a/ssn_parser_new/lists/english_text_oald b/ssn_parser_new/lists/english_text_oald new file mode 100644 index 0000000000000000000000000000000000000000..215a29c994288470020ee5719113bd17f61a2b6f --- /dev/null +++ b/ssn_parser_new/lists/english_text_oald @@ -0,0 +1,46 @@ +uh a +e e +a ae +o aa +i i +u u +ii ii +uu uu +oo aa +aa aa +@@ ar +ai ai +ei ee +oi aay +au au +ou oo +e@ ee +i@ iiya +u@ uwa +@ a +p p +t tx +k k +b b +d 
dx +g g +s s +z s +sh sx +zh sx +f f +v w +th t +dh d +ch c +jh j +h h +m m +n nx +ng ng +l l +y y +r r +w w +# # +SIL diff --git a/ssn_parser_new/lists/f1 b/ssn_parser_new/lists/f1 new file mode 100644 index 0000000000000000000000000000000000000000..cde87749e364e64a6a651c074f053566a4a4c91e --- /dev/null +++ b/ssn_parser_new/lists/f1 @@ -0,0 +1,21 @@ +JJ +N_NN +N_NN +N_NN +V_VM_VNF_RP +V_VM_VF_VBN +N_NN +N_NN +V_VM_VNF_RP +N_NNP +QT_QTC +DM_DMR +RB +RB +V_VM_VNF_INF +N_NN +PR_PRP +N_NN +N_NN +V_VM_VF +N_NN diff --git a/ssn_parser_new/lists/f2 b/ssn_parser_new/lists/f2 new file mode 100644 index 0000000000000000000000000000000000000000..01015bde217bde369abce986f1bc73ae59a8f1c2 --- /dev/null +++ b/ssn_parser_new/lists/f2 @@ -0,0 +1,21 @@ +N_NN +V_VM_VF +V_VM_VF_VBN +V_VM_VNF_VBN +N_NN +V_VM_VF +PR_PRP +PSP +PSP +N_NN +N_NN +N_NN +V_VM_VF +V_VM_VNF_VBN +V_VM_VF +RB +PSP +RP_NEG +CC_CCS +CC_CCS +V_VM_VNF_COND diff --git a/ssn_parser_new/lists/gen.scp b/ssn_parser_new/lists/gen.scp new file mode 100644 index 0000000000000000000000000000000000000000..7f4020121db8543894b2475a56ad883caac982fb --- /dev/null +++ b/ssn_parser_new/lists/gen.scp @@ -0,0 +1 @@ +/home/rachel/ssn_hts_demo/lab/1.lab diff --git a/ssn_parser_new/lists/language_map_cp b/ssn_parser_new/lists/language_map_cp new file mode 100644 index 0000000000000000000000000000000000000000..4eaae09f3dfa7b0b343d2497cb0556f3ac914c34 --- /dev/null +++ b/ssn_parser_new/lists/language_map_cp @@ -0,0 +1,264 @@ + +! ! 
+@ @ +" " +# # +% % +' ' +( ( +) ) +* * ++ + +, , +_ _ +- - +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +; ; +: : +் +ா aa +ி i +ீ ii +ு u +ூ uu +ெ e +ே ee +ை ai +ொ o +ோ oo +ௌ au +அ a +ஆ aa +இ i +ஈ ii +உ u +ஊ uu +எ e +ஏ ee +ஐ ai +ஒ o +ஓ oo +ஔ au +க k +ங ng +ச c +ஜ j +ஞ nj +ட tx +த t +ந nd +ண nx +ன n +ப p +ம m +ய y +ர r +ற rx +ல l +ள lx +ழ zh +வ w +ஷ sx +ஸ s +ஹ h +ஃ g +ஃப f +् +ा aa +ि i +ी ii +ु u +ू uu +ृ rx +े ee +ै ai +ो oo +ौ au +ं n +ः aha +ँ n +अ a +आ aa +इ i +ई ii +उ u +ऊ uu +ऋ rx +ए ee +ऐ ai +ओ oo +औ au +क k +ख kh +ग g +घ gh +ङ ng +च c +छ ch +ज j +झ jh +ञ nj +ट tx +ठ txh +ड dx +ढ dxh +ण nx +त t +थ th +द d +ध dh +न nd +प p +फ ph +ब b +भ bh +म m +य y +र r +ल l +ळ lx +व w +ष sx +श sh +स s +ह h +क्ष ksh +അ a +ആ aa +ഇ i +ഈ ii +ഉ u +ഊ uu +ഋ rx +എ e +ഏ ee +ഐ ai +ഒ o +ഓ oo +ഔ au +ം m +് +ഃ +ാ aa +ി i +ീ ii +ു u +ൂ uu +ൃ rx +െ e +േ ee +ൈ ai +ൊ o +ോ oo +ൌ au +ക k +ഖ k +ഗ g +ഘ g +ങ nx +ച c +ഛ c +ജ j +ഝ j +ഞ nj +ട tx +ഠ tx +ഡ t +ഢ t +ണ nx +ത t +ഥ tx +ദ d +ധ d +ന nd +പ p +ഫ f +ബ b +ഭ b +മ m +യ y +ര r +റ rx +ല l +ള lx +ഴ zh +വ w +ശ sx +ഷ sh +സ s +ഹ h +ా aa +ి i +ీ ii +ు u +ూ uu +ృ rx +ె e +ే ee +ై ai +ొ o +ో oo +ౌ au +ౖ ai +ఁ n +ం n +ః aha +అ a +ఆ aa +ఇ i +ఈ ii +ఉ u +ఊ uu +ఎ e +ఏ ee +ఐ ai +ఒ o +ఓ oo +ఔ au +బ b +భ b +చ c +ఛ c +డ dx +ఢ dx +ద dh +ధ d +ఫ f +గ g +ఘ g +హ h +జ j +ఝ j +క k +ఖ k +ల l +ళ lx +మ m +న nd +ణ nx +ప p +ఞ nj +ఙ ng +ర r +ఱ rx +ఋ rx +స s +ష sh +శ sh +ట t +ఠ tx +త th +థ t +వ w +య y +SIL SIL diff --git a/ssn_parser_new/lists/num b/ssn_parser_new/lists/num new file mode 100644 index 0000000000000000000000000000000000000000..8b1acc12b635c26f3decadeaa251729d3ce512e9 --- /dev/null +++ b/ssn_parser_new/lists/num @@ -0,0 +1,10 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/ssn_parser_new/lists/number b/ssn_parser_new/lists/number new file mode 100644 index 0000000000000000000000000000000000000000..e53eaa178f3b8f3be4178b6cb120265e899d524f --- /dev/null +++ b/ssn_parser_new/lists/number @@ -0,0 +1,10 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 diff --git 
a/ssn_parser_new/lists/o_au_map b/ssn_parser_new/lists/o_au_map new file mode 100644 index 0000000000000000000000000000000000000000..1606dcf1931f4e98bb6db3bb290ba503a40ccdff --- /dev/null +++ b/ssn_parser_new/lists/o_au_map @@ -0,0 +1,3 @@ +e aa o +ee aa oo +e lx au diff --git a/ssn_parser_new/lists/out_word b/ssn_parser_new/lists/out_word new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ssn_parser_new/lists/pb_pos_list_12hrs b/ssn_parser_new/lists/pb_pos_list_12hrs new file mode 100644 index 0000000000000000000000000000000000000000..138eaf3802afd782c400dff189c08370a8c300be --- /dev/null +++ b/ssn_parser_new/lists/pb_pos_list_12hrs @@ -0,0 +1,9 @@ +V_VM_VNG +V_VM_VF +RP_INJ +V_VM_VNF_COND +N_NNV +V_VM_VNF_RP_PSP +RP_NEG +PSP +CC_CCD diff --git a/ssn_parser_new/lists/phone_list b/ssn_parser_new/lists/phone_list new file mode 100644 index 0000000000000000000000000000000000000000..955cf5849aed168da8261e8833d8f7bebf314b9e --- /dev/null +++ b/ssn_parser_new/lists/phone_list @@ -0,0 +1,42 @@ +a +aa +ai +au +b +c +d +dx +e +ee +eu +f +g +h +i +ii +j +k +l +lx +m +n +nd +ng +nj +nx +o +oo +p +r +rx +s +SIL +sx +t +tx +u +uu +w +y +zh +ae diff --git a/ssn_parser_new/lists/phoneset_all b/ssn_parser_new/lists/phoneset_all new file mode 100644 index 0000000000000000000000000000000000000000..5900588755d0e6e774281be769ebfb7018ea752f --- /dev/null +++ b/ssn_parser_new/lists/phoneset_all @@ -0,0 +1,419 @@ +a +aa +ae +ai +au +b +ba +baa +bai +bau +be +bee +beu +bi +bii +bo +boo +bu +buu +c +ca +caa +cai +cau +ce +cee +ceu +ci +cii +co +coo +cu +cuu +d +da +daa +dai +dau +de +dee +deu +di +dii +do +doo +du +duu +dx +dxa +dxaa +dxai +dxau +dxe +dxee +dxeu +dxi +dxii +dxo +dxoo +dxu +dxuu +e +ee +eu +f +fa +faa +fai +fau +fe +fee +feu +fi +fii +fo +foo +fu +fuu +g +ga +gaa +gai +gau +ge +gee +geu +gi +gii +go +goo +gu +guu +h +ha +haa +hai +hau +he +hee +heu +hi +hii +ho +hoo +hu +huu +i +ii +j +ja +jaa +jai 
+jau +je +jee +jeu +ji +jii +jo +joo +ju +juu +k +ka +kaa +kai +kau +ke +kee +keu +ki +kii +ko +koo +ku +kuu +l +la +laa +lai +lau +le +lee +leu +li +lii +lo +loo +lu +luu +lx +lxa +lxaa +lxai +lxau +lxe +lxee +lxeu +lxi +lxii +lxo +lxoo +lxu +lxuu +m +ma +maa +mai +mau +me +mee +meu +mi +mii +mo +moo +mu +muu +n +na +naa +nai +nau +nd +nda +ndaa +ndai +ndau +nde +ndee +ndeu +ndi +ndii +ndo +ndoo +ndu +nduu +ne +nee +neu +ng +nga +ngaa +ngai +ngau +nge +ngee +ngeu +ngi +ngii +ngo +ngoo +ngu +nguu +ni +nii +nj +nja +njaa +njai +njau +nje +njee +njeu +nji +njii +njo +njoo +nju +njuu +no +noo +nu +nuu +nx +nxa +nxaa +nxai +nxau +nxe +nxee +nxeu +nxi +nxii +nxo +nxoo +nxu +nxuu +o +oo +p +pa +paa +pai +pau +pe +pee +peu +pi +pii +po +poo +pu +puu +r +ra +raa +rai +rau +re +ree +reu +ri +rii +ro +roo +ru +ruu +rx +rxa +rxaa +rxai +rxau +rxe +rxee +rxeu +rxi +rxii +rxo +rxoo +rxu +rxuu +s +sa +saa +sai +sau +se +see +seu +si +sii +so +soo +su +suu +sx +sxa +sxaa +sxai +sxau +sxe +sxee +sxeu +sxi +sxii +sxo +sxoo +sxu +sxuu +t +ta +taa +tai +tau +te +tee +teu +ti +tii +to +too +tu +tuu +tx +txa +txaa +txai +txau +txe +txee +txeu +txi +txii +txo +txoo +txu +txuu +u +uu +w +wa +waa +wai +wau +we +wee +weu +wi +wii +wo +woo +wu +wuu +y +ya +yaa +yai +yau +ye +yee +yeu +yi +yii +yo +yoo +yu +yuu +zh +zha +zhaa +zhai +zhau +zhe +zhee +zheu +zhi +zhii +zho +zhoo +zhu +zhuu +bae +cae +dae +dxae +fae +gae +hae +jae +kae +lae +lxae +mae +nae +ndae +ngae +njae +nxae +pae +rae +rxae +sae +sxae +tae +txae +wae +yae +zhae diff --git a/ssn_parser_new/lists/phoneset_mei b/ssn_parser_new/lists/phoneset_mei new file mode 100644 index 0000000000000000000000000000000000000000..ed88f194b9652e4555f44fd444dc5ba148b95766 --- /dev/null +++ b/ssn_parser_new/lists/phoneset_mei @@ -0,0 +1,27 @@ +b +c +d +dx +f +g +h +j +k +l +lx +m +n +nd +ng +nj +nx +p +r +rx +s +sx +t +tx +w +y +zh diff --git a/ssn_parser_new/lists/phoneset_uyir b/ssn_parser_new/lists/phoneset_uyir new file mode 100644 index 
0000000000000000000000000000000000000000..9afb6890a23c31e8cb597008834aa4fa01153000 --- /dev/null +++ b/ssn_parser_new/lists/phoneset_uyir @@ -0,0 +1,14 @@ +a +aa +ai +au +e +ee +eu +i +ii +o +oo +u +uu +ae diff --git a/ssn_parser_new/lists/phoneset_uyirmei b/ssn_parser_new/lists/phoneset_uyirmei new file mode 100644 index 0000000000000000000000000000000000000000..34d65fae00079d946e86b642d4565e797319dfc6 --- /dev/null +++ b/ssn_parser_new/lists/phoneset_uyirmei @@ -0,0 +1,378 @@ +ba +baa +bai +bau +be +bee +beu +bi +bii +bo +boo +bu +buu +ca +caa +cai +cau +ce +cee +ceu +ci +cii +co +coo +cu +cuu +da +daa +dai +dau +de +dee +deu +di +dii +do +doo +du +duu +dxa +dxaa +dxai +dxau +dxe +dxee +dxeu +dxi +dxii +dxo +dxoo +dxu +dxuu +fa +faa +fai +fau +fe +fee +feu +fi +fii +fo +foo +fu +fuu +ga +gaa +gai +gau +ge +gee +geu +gi +gii +go +goo +gu +guu +ha +haa +hai +hau +he +hee +heu +hi +hii +ho +hoo +hu +huu +ja +jaa +jai +jau +je +jee +jeu +ji +jii +jo +joo +ju +juu +ka +kaa +kai +kau +ke +kee +keu +ki +kii +ko +koo +ku +kuu +la +laa +lai +lau +le +lee +leu +li +lii +lo +loo +lu +luu +lxa +lxaa +lxai +lxau +lxe +lxee +lxeu +lxi +lxii +lxo +lxoo +lxu +lxuu +ma +maa +mai +mau +me +mee +meu +mi +mii +mo +moo +mu +muu +na +naa +nai +nau +ne +nee +neu +ni +nii +no +noo +nu +nuu +nda +ndaa +ndai +ndau +nde +ndee +ndeu +ndi +ndii +ndo +ndoo +ndu +nduu +nga +ngaa +ngai +ngau +nge +ngee +ngeu +ngi +ngii +ngo +ngoo +ngu +nguu +nja +njaa +njai +njau +nje +njee +njeu +nji +njii +njo +njoo +nju +njuu +nxa +nxaa +nxai +nxau +nxe +nxee +nxeu +nxi +nxii +nxo +nxoo +nxu +nxuu +pa +paa +pai +pau +pe +pee +peu +pi +pii +po +poo +pu +puu +ra +raa +rai +rau +re +ree +reu +ri +rii +ro +roo +ru +ruu +rxa +rxaa +rxai +rxau +rxe +rxee +rxeu +rxi +rxii +rxo +rxoo +rxu +rxuu +sa +saa +sai +sau +se +see +seu +si +sii +so +soo +su +suu +sxa +sxaa +sxai +sxau +sxe +sxee +sxeu +sxi +sxii +sxo +sxoo +sxu +sxuu +ta +taa +tai +tau +te +tee +teu +ti +tii +to +too +tu +tuu +txa +txaa +txai +txau +txe 
+txee +txeu +txi +txii +txo +txoo +txu +txuu +wa +waa +wai +wau +we +wee +weu +wi +wii +wo +woo +wu +wuu +ya +yaa +yai +yau +ye +yee +yeu +yi +yii +yo +yoo +yu +yuu +zha +zhaa +zhai +zhau +zhe +zhee +zheu +zhi +zhii +zho +zhoo +zhu +zhuu +bae +cae +dae +dxae +fae +gae +hae +jae +kae +lae +lxae +mae +nae +ndae +ngae +njae +nxae +pae +rae +rxae +sae +sxae +tae +txae +wae +yae +zhae diff --git a/ssn_parser_new/lists/spl_chr b/ssn_parser_new/lists/spl_chr new file mode 100644 index 0000000000000000000000000000000000000000..8467a526f1e27ba524b42a00fee0d89f06d3ba7c --- /dev/null +++ b/ssn_parser_new/lists/spl_chr @@ -0,0 +1,26 @@ +! +@ +# +$ +% +^ +& +* +( +) ++ += +{ +} +[ +] +" +; +' +< +> +, +. +? +“ +” diff --git a/ssn_parser_new/lists/spl_chr_map b/ssn_parser_new/lists/spl_chr_map new file mode 100644 index 0000000000000000000000000000000000000000..6f55e3025005aac29b3a5c81b89d6c0a178c0403 --- /dev/null +++ b/ssn_parser_new/lists/spl_chr_map @@ -0,0 +1,27 @@ +! +@ at அட் +# hash ஹேஷ் +$ dollar டாலர் +% percent சதவிகிதம் +^ +& and மற்றும் +* +( +) ++ plus கூட்டல் += equal to சமம் +{ +} +[ +] +" +; +' +< less than லெஸெர் தன் +> greater than க்ரேட்டர் தன் +, , +. . +? 
+₹ rupees ருபாய் +“ +” diff --git a/ssn_parser_new/lists/sv b/ssn_parser_new/lists/sv new file mode 100644 index 0000000000000000000000000000000000000000..fc19819355156ce21abbead7f1c34678d3faf6e0 --- /dev/null +++ b/ssn_parser_new/lists/sv @@ -0,0 +1,6 @@ +y +r +l +lx +zh +w diff --git a/ssn_parser_new/lists/syl_list b/ssn_parser_new/lists/syl_list new file mode 100644 index 0000000000000000000000000000000000000000..699e96e0833209f84eaab874e753c6c9abbb58f7 --- /dev/null +++ b/ssn_parser_new/lists/syl_list @@ -0,0 +1,24 @@ +deu +galx +yum +na +ga +keu +dxeu +lai +rxeu +yil +teu +til +txeu +kum +lum +naal +nar +dxum +daan +lxai +yaa +laam +war +lxum diff --git a/ssn_parser_new/lists/tamil b/ssn_parser_new/lists/tamil new file mode 100644 index 0000000000000000000000000000000000000000..de0eb7ea45c0fea758e176270283e10eb9af52f5 --- /dev/null +++ b/ssn_parser_new/lists/tamil @@ -0,0 +1,48 @@ +அ +ஆ +இ +ஈ +உ +ஊ +எ +ஏ +ஐ +ஒ +ஓ +ஔ +க +ங +ச +ஜ +ஞ +ட +த +ந +ண +ன +ப +ம +ய +ர +ற +ல +ள +ழ +வ +ஷ +ஸ +ஹ +ஃப +ா +ி +ீ +ு +ூ +ெ +ே +ை +ொ +ோ +ௌ +் + diff --git a/ssn_parser_new/lists/tamil_map b/ssn_parser_new/lists/tamil_map new file mode 100644 index 0000000000000000000000000000000000000000..28a8669d2c8a90fad9049c85280df523cca801ca --- /dev/null +++ b/ssn_parser_new/lists/tamil_map @@ -0,0 +1,150 @@ +# map file containing tamil characters [ ] english characters +# Multiple phonemes mapped to a single character are yet to be handled. +# Has 12 vowels,(18+5) consonants. +! ! +@ @ +" " +# # +% % +' ' +( ( +) ) +* * ++ + +, , +_ _ +- - +. . 
+/ / +; ; +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +ா aa +ி i +ீ ii +ு u +ூ uu +ெ e +ே ee +ை ai +ொ o +ோ oo +ௌ au +அ a +ஆ aa +இ i +ஈ ii +உ u +ஊ uu +எ e +ஏ ee +ஐ ai +ஒ o +ஓ oo +ஔ au +க k +க் k +ங ng +ங் ng +ச c +ச் c +ஜ j +ஜ் j +ஞ nj +ஞ் nj +ட tx +ட் tx +த t +த் t +ந nd +ந் nd +ண nx +ண் nx +ன n +ன் n +ப p +ப் p +ம m +ம் m +ய y +ய் y +ர r +ர் r +ற rx +ற் rx +ல l +ல் l +ள lx +ள் lx +ழ zh +ழ் zh +வ w +வ் w +ஷ sx +ஷ் sx +ஸ s +ஸ் s +ஹ h +ஹ் h +ஃப f +ஃப் f diff --git a/ssn_parser_new/lists/u_list b/ssn_parser_new/lists/u_list new file mode 100644 index 0000000000000000000000000000000000000000..9ca16784b002aab85ddc44b9f6198059dfd9d073 --- /dev/null +++ b/ssn_parser_new/lists/u_list @@ -0,0 +1,4 @@ +k +t +c +p diff --git a/ssn_parser_new/lists/vowel_list b/ssn_parser_new/lists/vowel_list new file mode 100644 index 0000000000000000000000000000000000000000..b4955ad8646c6691e9d6170810a61a943366085e --- /dev/null +++ b/ssn_parser_new/lists/vowel_list @@ -0,0 +1,13 @@ +a +aa +i +ii +u +uu +e +ee +ai +o +oo +au +eu diff --git a/ssn_parser_new/lists/vowels b/ssn_parser_new/lists/vowels new file mode 100644 index 0000000000000000000000000000000000000000..cdda51346fe0366296d4c8bd41ab6c30a7002423 --- /dev/null +++ b/ssn_parser_new/lists/vowels @@ -0,0 +1,48 @@ +அ +ஆ +இ +ஈ +உ +ஊ +எ +ஏ +ஐ +ஒ +ஓ +ஔ +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ओ +औ +അ +ആ +ഇ +ഈ +ഉ +ഊ +ഋ +എ +ഏ +ഐ +ഒ +ഓ +ഔ +అ +ఆ +ఇ +ఈ +ఉ +ఊ +ఎ +ఏ +ఐ +ఒ +ఓ +ఔ diff --git a/ssn_parser_new/lists/vuv_list b/ssn_parser_new/lists/vuv_list new file mode 100644 index 0000000000000000000000000000000000000000..f431d32be521eb668dd39ac999fb9e0844addf0a --- /dev/null +++ b/ssn_parser_new/lists/vuv_list @@ -0,0 +1,4 @@ +k g ng +t d nd +p b m +tx dx nx diff --git a/ssn_parser_new/lists/word1 
b/ssn_parser_new/lists/word1 new file mode 100644 index 0000000000000000000000000000000000000000..4d6015513f437421d5a6978f6270b21853e0d88a --- /dev/null +++ b/ssn_parser_new/lists/word1 @@ -0,0 +1 @@ +முயற்சியை diff --git a/ssn_parser_new/non_parallel-parser.py b/ssn_parser_new/non_parallel-parser.py new file mode 100644 index 0000000000000000000000000000000000000000..15dce52690bc6a65e1855bd9affc5803bf38df88 --- /dev/null +++ b/ssn_parser_new/non_parallel-parser.py @@ -0,0 +1,93 @@ +import sys +import os +import subprocess + +def process_word(word, phone_file_name): + with open('tempword', 'w') as tempword_file: + tempword_file.write(word) + + + os.system('python scripts/vul.py tempword 2> temp_output_string') + + output = '' + out_str = '' + + with open('lists/out_word') as out_word_file: + output = out_word_file.read() + + with open('temp_output_string') as temp_output_string_file: + out_str = temp_output_string_file.read() + + if out_str != '': + with open(f'{phone_file_name}.err', 'a') as err_file: + err_file.write(word + '\n') + else: + with open(f'{phone_file_name}.words', 'a') as words_file: + words_file.write(word + '\n') + with open(f'{phone_file_name}.cls', 'a') as cls_file: + cls_file.write(output + '\n') + + os.system('rm -rf phn tempword lists/tmp lists/nasal lists/trans_word lists/out_word') + +def main(): + if len(sys.argv) != 5: + print("Usage: python script.py unique_words output_file_name parser_path rand_num") + sys.exit(1) + + unique_words = sys.argv[1] + output_file_name = sys.argv[2] + parser_path = sys.argv[3] + rand_num = sys.argv[4] + phone_file_name = 'phone_out_file' + + os.system(f'cp {unique_words} {parser_path}/') + curr_path = os.getcwd() + os.chdir(parser_path) + + os.system(f'rm {phone_file_name}.words {phone_file_name}.cls {phone_file_name}.err {phone_file_name}') + os.system('rm -rf temp_output_string phn tempword lists/tmp lists/nasal lists/trans_word lists/out_word') + + with open(unique_words) as unique_words_file: + for word 
in unique_words_file: + process_word(word.strip(), phone_file_name) + + os.system(f'rm -rf temp_output_string phn tempword lists/tmp lists/nasal lists/trans_word lists/out_word') + + os.system(f'cp {phone_file_name}.cls {phone_file_name}') + os.system(f'sed -i \'s/ /""/g\' {phone_file_name}') + os.system(f'sed -i \'s/^/""/g\' {phone_file_name}') + os.system(f'sed -i \'s/$/""/g\' {phone_file_name}') + subprocess.run(['python', 'get_phone_mapped_text.py', phone_file_name]) + os.system(f'sed -i \'s/"//g\' {phone_file_name}') + os.system(f'sed -i \'s/ //g\' {phone_file_name}') + + words_str = '' + with open(f'{phone_file_name}.words') as words_file: + words_str = words_file.read() + + if words_str != '': + os.system(f'paste -d\'\\t\' {phone_file_name}.words {phone_file_name} > {output_file_name}') + else: + os.system(f'touch {output_file_name}') + + err_str = '' + # with open(f'{phone_file_name}.err') as err_file: + # err_str = err_file.read() + + try: + with open(f'{phone_file_name}.err') as err_file: + err_str = err_file.read() + except FileNotFoundError: + # File not found, create the file + with open(f'{phone_file_name}.err', 'w') as err_file: + # Optionally, you can write some initial content to the file + err_file.write(f'Error {FileNotFoundError}') + + # if err_str != '': + # os.system(f'bash phonify_wrapper.sh {parser_path}/{phone_file_name}.err {output_file_name}.err.out {rand_num} {curr_path}/ssn_parser/') + # os.system(f'cat {output_file_name}.err.out >> {output_file_name}') + + os.chdir(curr_path) + +if __name__ == "__main__": + main() diff --git a/ssn_parser_new/output b/ssn_parser_new/output new file mode 100644 index 0000000000000000000000000000000000000000..763d84ba0e2df65fb4ecd49bf7053a40315caa2d --- /dev/null +++ b/ssn_parser_new/output @@ -0,0 +1,9 @@ +a +ऐadimuga +af +afkamum +afkam +afkAmऐ +afki +aftaङgஉk +aftaडuङgAl diff --git a/ssn_parser_new/output.cls b/ssn_parser_new/output.cls new file mode 100644 index 
0000000000000000000000000000000000000000..7baa37cb178b47c8d644cfbe4768cd1f6df13731 --- /dev/null +++ b/ssn_parser_new/output.cls @@ -0,0 +1,9 @@ +a +ai a d i m u g a +a f +a f k a m u m +a f k a m +a f k aa m ai +a f k i +a f t a ng g eu k +a f t a dx u ng g aa l diff --git a/ssn_parser_new/output.err b/ssn_parser_new/output.err new file mode 100644 index 0000000000000000000000000000000000000000..dd803ab52761812561bdf8570f470eacf1705750 --- /dev/null +++ b/ssn_parser_new/output.err @@ -0,0 +1 @@ +௨ diff --git a/ssn_parser_new/output.words b/ssn_parser_new/output.words new file mode 100644 index 0000000000000000000000000000000000000000..070105087d7c71a099df7f40061b71a84bd41f32 --- /dev/null +++ b/ssn_parser_new/output.words @@ -0,0 +1,9 @@ +அ +அஇஅதிமுக +அஃ +அஃகமும் +அஃகம் +அஃகாமை +அஃகி +அஃதங்குக் +அஃதடுங்கால் diff --git a/ssn_parser_new/phone_out_file b/ssn_parser_new/phone_out_file new file mode 100644 index 0000000000000000000000000000000000000000..aece774538d3f7982c1ed3c186160172687cbc28 --- /dev/null +++ b/ssn_parser_new/phone_out_file @@ -0,0 +1,9 @@ +a +ऐadimuga +af +afkamum +afkam +afkAmऐ +afki +aftaङgउk +aftaडuङgAl diff --git a/ssn_parser_new/phone_out_file.cls b/ssn_parser_new/phone_out_file.cls new file mode 100644 index 0000000000000000000000000000000000000000..7baa37cb178b47c8d644cfbe4768cd1f6df13731 --- /dev/null +++ b/ssn_parser_new/phone_out_file.cls @@ -0,0 +1,9 @@ +a +ai a d i m u g a +a f +a f k a m u m +a f k a m +a f k aa m ai +a f k i +a f t a ng g eu k +a f t a dx u ng g aa l diff --git a/ssn_parser_new/phone_out_file.err b/ssn_parser_new/phone_out_file.err new file mode 100644 index 0000000000000000000000000000000000000000..dd803ab52761812561bdf8570f470eacf1705750 --- /dev/null +++ b/ssn_parser_new/phone_out_file.err @@ -0,0 +1 @@ +௨ diff --git a/ssn_parser_new/phone_out_file.words b/ssn_parser_new/phone_out_file.words new file mode 100644 index 0000000000000000000000000000000000000000..070105087d7c71a099df7f40061b71a84bd41f32 --- /dev/null 
+++ b/ssn_parser_new/phone_out_file.words @@ -0,0 +1,9 @@ +அ +அஇஅதிமுக +அஃ +அஃகமும் +அஃகம் +அஃகாமை +அஃகி +அஃதங்குக் +அஃதடுங்கால் diff --git a/ssn_parser_new/phonify_wrapper.py b/ssn_parser_new/phonify_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c89ddcf54dfd64e1e740a11b15e0c87e9421d132 --- /dev/null +++ b/ssn_parser_new/phonify_wrapper.py @@ -0,0 +1,50 @@ +import os +import subprocess + +def main(): + inpFile = sys.argv[1] + outFile = sys.argv[2] + randNum = sys.argv[3] + basePath = sys.argv[4] + currPath = os.getcwd() + unifParFold = os.path.join(basePath, 'unified_parser') + uniParOut = f'.uniOut_{randNum}.txt' + uniParList = inpFile + uniParTemp = f'.uniTemp_{randNum}.txt' + + print("The data is successfully reached") + os.chdir(unifParFold) + os.mkdir(f'uniPar_{randNum}') + + nj = int(subprocess.check_output(['wc', '-l', inpFile]).decode().split()[0]) # number of parallel jobs + if nj > 48: + nj = 48 + + with open(uniParList, 'r') as infile: + with open(uniParTemp, 'w') as tempfile: + for i, line in enumerate(infile, start=1): + tempfile.write(f"{line.rstrip()}\tuniPar_{randNum}/word_{i:04d}.txt\n") + + command = f"awk '{{printf \"%s\\tuniPar_{randNum}/word_%04d.txt\\n\", $0, NR}}' {uniParList} | \ + parallel -j {nj} --colsep '\t' 'valgrind ./unified-parser {{1}} {{2}} 1 0 0 0 > /dev/null 2> /dev/null' > /dev/null 2> /dev/null" + subprocess.run(command, shell=True, check=True) + + os.system(f"cat uniPar_{randNum}/*.txt > {uniParTemp}") + os.rmdir(f'uniPar_{randNum}') + + subprocess.run(['bash', 'get_phone_mapped_text_updated.sh', uniParTemp, uniParOut]) + + os.system(f"sed -i \"s:^(set! 
wordstruct '::g\" {uniParOut}") + os.system(f"sed -i 's:[)(\"0 ]::g' {uniParOut}") + + command = f"paste -d' ' {uniParList} {uniParOut} >> {outFile}" + os.system(command) + + os.remove(uniParTemp) + os.remove(uniParOut) + + os.chdir(currPath) + +if __name__ == "__main__": + import sys + main() diff --git a/ssn_parser_new/scripts/ortho_to_phonetic1.py b/ssn_parser_new/scripts/ortho_to_phonetic1.py new file mode 100644 index 0000000000000000000000000000000000000000..53b0be375840aefa3beaaeff50a887896be764bd --- /dev/null +++ b/ssn_parser_new/scripts/ortho_to_phonetic1.py @@ -0,0 +1,75 @@ +import re + +def cat(file): + with open(file, 'r') as f: + return f.read() + +def ortho_to_phonetic(input_file, phone_list_file, output_file): + with open(input_file, 'r') as f: + words = f.read().split() + + with open(phone_list_file, 'r') as f: + phone_list = set(f.read().splitlines()) + + word_start = 0 + with open(output_file, 'w') as phn_handle: + while word_start < len(words): + word = words[word_start] + if word != "SIL": + num = len(word) + phone_start1 = 0 + while phone_start1 < num: + p1 = word[phone_start1:phone_start1 + 2] + p2 = word[phone_start1:phone_start1 + 3] + p3 = word[phone_start1:phone_start1 + 4] + p4 = word[phone_start1:phone_start1 + 5] + p5 = word[phone_start1:phone_start1 + 6] + + cou = len(set(re.findall(rf'\b{re.escape(p1)}\b', cat(phone_list_file)))) + cou1 = len(set(re.findall(rf'\b{re.escape(p2)}\b', cat(phone_list_file)))) + cou2 = len(set(re.findall(rf'\b{re.escape(p3)}\b', cat(phone_list_file)))) + cou3 = len(set(re.findall(rf'\b{re.escape(p4)}\b', cat(phone_list_file)))) + cou4 = len(set(re.findall(rf'\b{re.escape(p5)}\b', cat(phone_list_file)))) + + + + + if cou4 == 1: + phn_handle.write(p5 + "\n") + phone_start1 += 6 + elif cou3 == 1: + phn_handle.write(p4 + "\n") + phone_start1 += 5 + elif cou2 == 1: + phn_handle.write(p3 + "\n") + phone_start1 += 4 + elif cou1 == 1: + phn_handle.write(p2 + "\n") + phone_start1 += 3 + elif cou == 1: + 
phn_handle.write(p1 + "\n") + phone_start1 += 2 + else: + p1 = word[phone_start1] + if p1 in [",", "."]: + phone_start1 += 1 + else: + phn_handle.write(p1 + "\n") + phone_start1 += 1 + else: + phn_handle.write("SIL\n") + break + word_start += 1 + +if __name__ == "__main__": + import sys + + if len(sys.argv) != 4: + print("Usage: python script.py input_file phone_list output_file") + sys.exit(0) + + #print("Test -- 6") + input_file, phone_list_file, output_file = sys.argv[1], sys.argv[2], sys.argv[3] + + #print("output_file", output_file) + ortho_to_phonetic(input_file, phone_list_file, output_file) diff --git a/ssn_parser_new/scripts/tamil_trans_py b/ssn_parser_new/scripts/tamil_trans_py new file mode 100644 index 0000000000000000000000000000000000000000..db178960f9b6febea49a24fc384068fc70be9071 --- /dev/null +++ b/ssn_parser_new/scripts/tamil_trans_py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:764578b6dceb6b64d25fffd712738861a3dc29914920d022d52d7f96b68e700a +size 17577056 diff --git a/ssn_parser_new/scripts/tamil_trans_py.py b/ssn_parser_new/scripts/tamil_trans_py.py new file mode 100644 index 0000000000000000000000000000000000000000..979cc4820f340877b849cae500761ca2417a0132 --- /dev/null +++ b/ssn_parser_new/scripts/tamil_trans_py.py @@ -0,0 +1,104 @@ +import sys + +class TableEntry: + def __init__(self): + self.tamil = "" + self.english = "" + +def is_d_v(character, d_v): + return character in d_v + +def is_non_printable(character): + return character in {'\n', '\r', '\t', ' '} + +def transliterate(token, tamil_map): + for entry in tamil_map: + if token == entry.tamil: + return entry.english + print(f"No English character in the map file for {token}") + return None + +def main(): + if len(sys.argv) == 2: + map_file = open(sys.argv[1], "r") + input_text = sys.stdin + output_text = sys.stdout + elif len(sys.argv) == 4: + map_file = open(sys.argv[1], "r") + input_text = open(sys.argv[2], "r") + output_text = open(sys.argv[3], "w") + 
else: + print("./tamil_english map_file or\n./tamil_english map_file input output") + return 3 + + tamil_map = [] + n_characters = 0 + + for line in map_file: + if line[0] == '#': + continue + tamil, english = line.split() + entry = TableEntry() + entry.tamil = tamil + entry.english = english + tamil_map.append(entry) + n_characters += 1 + + d_v = ['\u0BBE', '\u0BBF', '\u0BC0', '\u0BC1', '\u0BC2', '\u0BC6', '\u0BC7', '\u0BC8', '\u0BCA', '\u0BCB', '\u0BCC'] + vowels = ['\u0B85', '\u0B86', '\u0B87', '\u0B88', '\u0B89', '\u0B8A', '\u0B8E', '\u0B8F', '\u0B90', '\u0B92', '\u0B93', '\u0B94', '\u0B83'] + + character_previous = '' + character = input_text.read(1) + character_next = input_text.read(1) + + while character: + pos = input_text.tell() + + if is_non_printable(character): + print(f"\nNon Printable\tSS{character}SS") + output_text.write(character) + elif is_d_v(character, vowels): + if character == '\u0B83' and character_next == '\u0BAA': + token = character + character_transliterated = transliterate(token, tamil_map) + if character_transliterated is not None: + output_text.write(character_transliterated) + else: + token = character + input_text.seek(pos) + character_transliterated = transliterate(token, tamil_map) + if character_transliterated is not None: + output_text.write(character_transliterated) + else: + if character_next == '\u0BCD': + token = character + character_next + character_transliterated = transliterate(token, tamil_map) + if character_transliterated is not None: + output_text.write(character_transliterated) + elif is_d_v(character, d_v): + token = character + input_text.seek(pos) + character_transliterated = transliterate(token, tamil_map) + if character_transliterated is not None: + output_text.write(character_transliterated) + elif not is_d_v(character_next, d_v): + token = character + input_text.seek(pos) + character_transliterated = transliterate(token, tamil_map) + if character_transliterated is not None: + if token == 
character_transliterated: + output_text.write(character_transliterated) + else: + output_text.write(character_transliterated + "a") + elif is_d_v(character_next, d_v): + token = character + input_text.seek(pos) + character_transliterated = transliterate(token, tamil_map) + if character_transliterated is not None: + output_text.write(character_transliterated) + + character = character_next + character_next = input_text.read(1) + +if __name__ == "__main__": + main() diff --git a/ssn_parser_new/scripts/vul.py b/ssn_parser_new/scripts/vul.py new file mode 100644 index 0000000000000000000000000000000000000000..18d1cfc8c725e5441a8ae331b6c52e6efcef526c --- /dev/null +++ b/ssn_parser_new/scripts/vul.py @@ -0,0 +1,179 @@ +import subprocess +import sys + +def cat(file_path): + with open(file_path, 'r') as file: + return file.read() + +def head(text, lines): + text_split = text.split('\n') + return '\n'.join(text_split[:lines]) + +def tail(text, lines): + text_split = text.split('\n') + return '\n'.join(text_split[-lines:]) + +def process_word_file(word_file_path): + if len(sys.argv) != 2: + print("arg --> word file") + exit(0) + + + char = cat(sys.argv[1])[1] + + f1 = 0 + with open("lists/alphabets", 'r') as alphabets_file: + f1 = sum(1 for line in alphabets_file if char in line.split()) + + if f1 == 0: + with open("lists/out_word", 'w') as out_word_handle: + + #subprocess.call(["scripts/tamil_trans", "lists/tamil_map", sys.argv[1], "lists/trans_word"]) + command = ["scripts/tamil_trans_py", "lists/tamil_map", sys.argv[1], "lists/trans_word"] + try: + #print("Executing command:", " ".join(command)) + return_code = subprocess.run(command) + #print("Return code:", return_code) + # if return_code == 0: + # print("Command executed successfully") + # else: + # print(f"Command failed with return code {return_code}") + except Exception as e: + print(f"An error occurred: {e}") + + + # import shutil + # import os + + # source_path = 'lists/trans_word' + # destination_path = 
'/home/mukesh/Desktop/' + + # # Ensure the source file exists before attempting to copy + # if os.path.exists(source_path): + # shutil.copy(source_path, destination_path) + # print(f"File copied to {destination_path}") + # else: + # print(f"The source file {source_path} does not exist.") + + #subprocess.call(["python", "scripts/ortho_to_phonetic1.py", "lists/trans_word", "lists/phone_list", "phn"]) + + # command1 = ["python", "scripts/ortho_to_phonetic1.py", "lists/trans_word", "lists/phone_list", "phn"] + # try: + # print("Executing command:", " ".join(command1)) + # return_code1 = subprocess.call(command1) + # print("Return code:", return_code1) + # if return_code1 == 0: + # print("Command executed successfully") + # else: + # print(f"Command failed with return code {return_code1}") + # except Exception as e: + # print(f"An error occurred: {e}") + + + try: + result = subprocess.run(["python", "scripts/ortho_to_phonetic1.py", "lists/trans_word", "lists/phone_list", "phn"], + capture_output=True, text=True, check=True) + + #print("Subprocess Output:", result.stdout) + except subprocess.CalledProcessError as e: + print("Subprocess Error:", e.stderr) + print("Return Code:", e.returncode) + + phn_lines = cat("phn").split('\n') + count = len(phn_lines) + start = 2 + phn = tail(head(cat("phn"), 1), 1) + + if phn == "c": + out_word_handle.write(" s") + else: + out_word_handle.write(f" {phn}") + + while start <= count: + phn = tail(head(cat("phn"), start), 1) + c0 = start - 1 + c1 = start + 1 + c2 = start + 2 + phn_1 = tail(head(cat("phn"), c0), 1) + phn_2 = tail(head(cat("phn"), c1), 1) + + if ( + (phn == "c" and phn_1 == "c") or + (phn == "c" and phn_2 == "c") or + (phn == "c" and phn_1 == "tx") + ): + out_word_handle.write(f" {phn}") + elif phn == "c" and phn_1 == "nj": + out_word_handle.write(" j") + elif phn == "c" and phn_1 != "c": + out_word_handle.write(" s") + elif (phn == "rx" and phn_2 == "rx"): + out_word_handle.write(" tx") + else: + temp_vuv_lines = 
cat("lists/vuv_list").split('\n') + with open("lists/tmp", 'w') as list_temp: + for line1 in temp_vuv_lines: + line_temp = line1.split() + if line_temp: # Check if line_temp is not empty + list_temp.write(line_temp[0] + '\n') + + temp_vuv = cat("lists/tmp").split('\n') + flg = sum(1 for line in temp_vuv if phn in line.split()) + phn0 = tail(head(cat("phn"), c0), 1) + phn1 = tail(head(cat("phn"), c1), 1) + phn2 = tail(head(cat("phn"), c2), 1) + + if phn == "u": + flg_1 = sum(1 for line in cat("lists/u_list").split('\n') if phn1 in line.split()) + if (start == count) or (flg_1 != 0 and c1 == count) or (phn1 == "k" and phn2 == "k" and c0 != 1): + out_word_handle.write(" eu") + else: + out_word_handle.write(f" {phn}") + elif phn == "c": + if phn0 == "c" or phn1 == "c" or c1 == count: + out_word_handle.write(f" {phn}") + elif phn0 == "nj": + out_word_handle.write(" j") + else: + out_word_handle.write(" s") + elif flg == 1: + temp_lines = cat("lists/vuv_list").split('\n') + with open("lists/nasal", 'w') as list_nasal: + for line_nasal in temp_lines: + line_temp = line_nasal.split() + if len(line_temp) >= 3: # Check if line_temp has at least three elements + list_nasal.write(line_temp[2] + '\n') + + + flg1 = sum(1 for line in cat("lists/vowel_list").split('\n') if phn0 in line.split()) + flg2 = sum(1 for line in cat("lists/vowel_list").split('\n') if phn1 in line.split()) + flg3 = sum(1 for line in cat("lists/nasal").split('\n') if phn0 in line.split()) + flg4 = sum(1 for line in cat("lists/sv").split('\n') if phn0 in line.split()) + + if phn == "p": + if (flg1 == 1 and flg2 == 1) or (flg3 == 1 and phn0 != phn) or (phn0 == "n"): + phn_v_tmp = next(line.split() for line in cat("lists/vuv_list").split('\n') if phn in line.split()) + phn_v = phn_v_tmp[1] + out_word_handle.write(f" {phn_v}") + else: + out_word_handle.write(f" {phn}") + elif (flg1 == 1 and flg2 == 1) or (flg3 == 1 and phn0 != phn) or (flg4 == 1 and flg2 == 1): + temp_phn_v = next(line.split() for line in 
cat("lists/vuv_list").split('\n') if phn in line.split()) + phn_v = temp_phn_v[1] + out_word_handle.write(f" {phn_v}") + else: + out_word_handle.write(f" {phn}") + else: + out_word_handle.write(f" {phn}") + + start += 1 + #print() + +import os +import pdb +if __name__ == "__main__": + if len(sys.argv) != 2: + print("arg --> word file") + exit(0) + + process_word_file(sys.argv[1]) diff --git a/ssn_parser_new/tamil_parser.py b/ssn_parser_new/tamil_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..03e3784d33a3f17c82ccb386b069f24560de16c9 --- /dev/null +++ b/ssn_parser_new/tamil_parser.py @@ -0,0 +1,37 @@ +import shutil +import subprocess +import sys +import os + +def main(): + if len(sys.argv) != 5: + print("Usage: python script.py inp_file out_file rand_num ssn_parser_folder") + sys.exit(1) + + + inp_file = sys.argv[1] + out_file = sys.argv[2] + rand_num = sys.argv[3] + ssn_parser_folder = sys.argv[4] + + new_folder = f"{ssn_parser_folder}_{rand_num}" + + # Copy the ssn_parser_folder to a new folder + + shutil.copytree(ssn_parser_folder, new_folder) + + # Run the non_parallel-parser.py script + subprocess.run([ + "python", + os.path.join(new_folder, "non_parallel-parser.py"), + inp_file, + out_file, + new_folder, + rand_num + ]) + + # Remove the temporary folder + shutil.rmtree(new_folder) + +if __name__ == "__main__": + main() diff --git a/ssn_parser_new/temp b/ssn_parser_new/temp new file mode 100644 index 0000000000000000000000000000000000000000..6230839adbbf0c78dc2c6fb21befa0cac1c98b60 --- /dev/null +++ b/ssn_parser_new/temp @@ -0,0 +1,9 @@ +அ a +அஇஅதிமுக ऐadimuga +அஃ af +அஃகமும் afkamum +அஃகம் afkam +அஃகாமை afkAmऐ +அஃகி afki +அஃதங்குக் aftaङgउk +அஃதடுங்கால் aftaडuङgAl diff --git a/ssn_parser_new/word b/ssn_parser_new/word new file mode 100644 index 0000000000000000000000000000000000000000..059083fb62beda2dc0da153b6f416bbde3ef14e1 --- /dev/null +++ b/ssn_parser_new/word @@ -0,0 +1 @@ +வெறுங்காலுடன் diff --git a/ssn_parser_new/words 
b/ssn_parser_new/words new file mode 100644 index 0000000000000000000000000000000000000000..77f778600dbfad377704dfba12a4adbfc6431e7a --- /dev/null +++ b/ssn_parser_new/words @@ -0,0 +1,10 @@ +௨ +அ +அஇஅதிமுக +அஃ +அஃகமும் +அஃகம் +அஃகாமை +அஃகி +அஃதங்குக் +அஃதடுங்கால்