#!/usr/bin/env python3
# PraveenSharma08 - Initial project upload: Hindi/English Text-to-Speech pipeline (8a02978)
import argparse
import os
import sys
import re
from hindi_xlit import HindiTransliterator
from .classifier import load_model, classify_text
from .transliteration import load_dictionary, get_transliteration
from .utils import _print_predictions, _write_predictions, BASE_DIR
def _compile_word_patterns(hindi_to_devanagari):
    """Compile one case-insensitive whole-word pattern per romanized word.

    Returns a list of ``(pattern, devanagari)`` pairs in the mapping's
    iteration order, so substitutions are applied in the same order the
    words were transliterated.
    """
    return [
        (
            re.compile(
                rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
                flags=re.IGNORECASE,
            ),
            dev,
        )
        for word, dev in hindi_to_devanagari.items()
    ]


def _report_interactive_session(hindi_words, all_input, dictionary, transliterator):
    """Print the transliteration summary and write ``final_output.txt``.

    Args:
        hindi_words: Set of tokens labelled ``'HI'`` during the session.
        all_input: The input lines, in the order they were entered.
        dictionary: Transliteration dictionary passed to ``get_transliteration``.
        transliterator: ``HindiTransliterator`` instance passed to
            ``get_transliteration``.
    """
    if not hindi_words:
        print("\nNo Hindi words found.")
        return

    print("\nAll Hindi words found:", ", ".join(sorted(hindi_words)))
    print("\nTransliterated to Devanagari:")
    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            devanagari = get_transliteration(word, dictionary, transliterator)
            hindi_to_devanagari[word] = devanagari
            src = " (dictionary)" if word.lower() in dictionary else " (model)"
            print(f"{word} -> {devanagari}{src}")
        except Exception as e:
            # Best effort: report the failure and keep going with the other words.
            print(f"{word} -> [Error: {e}]")

    # Compile once up front instead of rebuilding every pattern for every line.
    word_patterns = _compile_word_patterns(hindi_to_devanagari)

    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=== Original Text ===\n")
        f.write('\n'.join(all_input) + '\n\n')
        f.write("=== Reconstructed Text with Devanagari ===\n")
        for line in all_input:
            reconstructed = line
            for pattern, dev in word_patterns:
                # A callable replacement inserts the transliteration verbatim;
                # a plain string would be parsed as a template, so backslashes
                # or group references in it would be misread or raise re.error.
                reconstructed = pattern.sub(lambda _match, dev=dev: dev, reconstructed)
            f.write(reconstructed + '\n')
        f.write("\n=== Hindi Words and Transliterations ===\n")
        for word, dev in sorted(hindi_to_devanagari.items()):
            f.write(f"{word} -> {dev}\n")
    print(f"\nOutput saved to: {output_file}")


def main():
    """Command-line entry point for token-level language identification.

    Flags:
        --device: Forwarded to ``load_model``.
        --text: If given, classify this text once, print the results and exit.
        --threshold: Forwarded to ``classify_text`` (default 0.80).
        --dictionary: Forwarded to ``load_dictionary``.

    Without ``--text`` the function reads lines from standard input until
    ``QUIT`` (case-insensitive), end of input, or Ctrl-C. It classifies each
    non-empty line, then prints a transliteration summary and writes
    ``final_output.txt`` under ``BASE_DIR``.
    """
    parser = argparse.ArgumentParser(description='Test l3cube-pune/hing-bert-lid on text (token-level).')
    parser.add_argument('--device', type=str, default=None)
    parser.add_argument('--text', type=str, default=None)
    parser.add_argument('--threshold', type=float, default=0.80)
    parser.add_argument('--dictionary', type=str, default=None)
    args = parser.parse_args()

    tokenizer, model, device = load_model(args.device)
    dictionary = load_dictionary(args.dictionary)
    transliterator = HindiTransliterator()
    hindi_words = set()

    if args.text:
        preds = classify_text(args.text, tokenizer, model, device, args.threshold)
        _print_predictions(preds)
        _write_predictions(preds, args.text)
        hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
        # Sorted so the output is deterministic, matching the interactive summary.
        print("\nHindi words found:", ", ".join(sorted(hindi_words)) if hindi_words else "None")
        return

    print("Interactive mode. Type text lines (QUIT to exit).")
    all_input = []
    try:
        for line in sys.stdin:
            line = line.rstrip('\n')
            if not line:
                continue
            if line.strip().upper() == 'QUIT':
                break
            all_input.append(line)
            preds = classify_text(line, tokenizer, model, device, args.threshold)
            _print_predictions(preds)
            _write_predictions(preds, line)
            hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
            print()
    except (KeyboardInterrupt, EOFError):
        pass
    finally:
        # Kept in ``finally`` so the summary is still produced if the loop
        # exits through an unexpected exception.
        _report_interactive_session(hindi_words, all_input, dictionary, transliterator)
if __name__ == '__main__':
    # Script entry point. On Windows, switch stdout to UTF-8 before running
    # so the Devanagari text that main() prints can be encoded.
    if sys.platform == 'win32':
        stdout_stream = sys.stdout
        try:
            stdout_stream.reconfigure(encoding='utf-8')
        except (AttributeError, TypeError):
            # The stream has no usable reconfigure(); wrap its byte buffer
            # instead and replace anything that cannot be encoded.
            import io
            sys.stdout = io.TextIOWrapper(
                stdout_stream.buffer,
                encoding='utf-8',
                errors='replace',
            )
    main()
def process_text(text, device=None, threshold=0.80, dictionary_path=None):
    """Classify *text* token by token and replace Hindi tokens with Devanagari.

    Each call loads the model, dictionary and transliterator via
    ``load_model``, ``load_dictionary`` and ``HindiTransliterator``.

    Side effect: when at least one Hindi token is found, the original and
    reconstructed text are written to ``final_output.txt`` under ``BASE_DIR``,
    overwriting any existing file.

    Args:
        text (str): The input text to analyze.
        device (str, optional): Forwarded to ``load_model``. Defaults to None.
        threshold (float, optional): Confidence threshold forwarded to
            ``classify_text``. Defaults to 0.80.
        dictionary_path (str, optional): Path forwarded to
            ``load_dictionary``. Defaults to None.

    Returns:
        dict: {
            "hindi_words": set of tokens labelled 'HI',
            "transliterations": dict of {word: devanagari}; a word whose
                transliteration failed maps to itself,
            "final_text": text with Hindi words replaced by Devanagari,
            "output_file": path of the written file, or None when no Hindi
                words were found (nothing is written in that case)
        }
    """
    tokenizer, model, device = load_model(device)
    dictionary = load_dictionary(dictionary_path)
    transliterator = HindiTransliterator()

    preds = classify_text(text, tokenizer, model, device, threshold)
    hindi_words = {pred.token for pred in preds if pred.label == 'HI'}

    if not hindi_words:
        return {
            "hindi_words": set(),
            "transliterations": {},
            "final_text": text,
            "output_file": None
        }

    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            hindi_to_devanagari[word] = get_transliteration(word, dictionary, transliterator)
        except Exception:
            # Best effort: keep the romanized word when transliteration fails.
            hindi_to_devanagari[word] = word

    reconstructed = text
    for word, dev in hindi_to_devanagari.items():
        # A callable replacement inserts ``dev`` verbatim. A plain string
        # would be parsed as a replacement template, so a backslash or group
        # reference in it (possible for the fallback value, which is the raw
        # token) would be misread or raise re.error.
        reconstructed = re.sub(
            rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
            lambda _match, dev=dev: dev,
            reconstructed,
            flags=re.IGNORECASE
        )

    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=== Original Text ===\n")
        f.write(text + "\n\n")
        f.write("=== Reconstructed Text ===\n")
        f.write(reconstructed + "\n")

    return {
        "hindi_words": hindi_words,
        "transliterations": hindi_to_devanagari,
        "final_text": reconstructed,
        "output_file": output_file
    }