#!/usr/bin/env python3
import argparse
import os
import sys
import re
from hindi_xlit import HindiTransliterator
from .classifier import load_model, classify_text
from .transliteration import load_dictionary, get_transliteration
from .utils import _print_predictions, _write_predictions, BASE_DIR
def main():
parser = argparse.ArgumentParser(description='Test l3cube-pune/hing-bert-lid on text (token-level).')
parser.add_argument('--device', type=str, default=None)
parser.add_argument('--text', type=str, default=None)
parser.add_argument('--threshold', type=float, default=0.80)
parser.add_argument('--dictionary', type=str, default=None)
args = parser.parse_args()
tokenizer, model, device = load_model(args.device)
dictionary = load_dictionary(args.dictionary)
transliterator = HindiTransliterator()
hindi_words = set()
if args.text:
preds = classify_text(args.text, tokenizer, model, device, args.threshold)
_print_predictions(preds)
_write_predictions(preds, args.text)
hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
print("\nHindi words found:", ", ".join(hindi_words) if hindi_words else "None")
return
print("Interactive mode. Type text lines (QUIT to exit).")
all_input = []
try:
for line in sys.stdin:
line = line.rstrip('\n')
if not line:
continue
if line.strip().upper() == 'QUIT':
break
all_input.append(line)
preds = classify_text(line, tokenizer, model, device, args.threshold)
_print_predictions(preds)
_write_predictions(preds, line)
hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
print()
except (KeyboardInterrupt, EOFError):
pass
finally:
if hindi_words:
print("\nAll Hindi words found:", ", ".join(sorted(hindi_words)))
print("\nTransliterated to Devanagari:")
hindi_to_devanagari = {}
for word in sorted(hindi_words):
try:
devanagari = get_transliteration(word, dictionary, transliterator)
hindi_to_devanagari[word] = devanagari
src = " (dictionary)" if word.lower() in dictionary else " (model)"
print(f"{word} -> {devanagari}{src}")
except Exception as e:
print(f"{word} -> [Error: {str(e)}]")
output_file = os.path.join(BASE_DIR, 'final_output.txt')
with open(output_file, 'w', encoding='utf-8') as f:
f.write("=== Original Text ===\n")
f.write('\n'.join(all_input) + '\n\n')
f.write("=== Reconstructed Text with Devanagari ===\n")
for line in all_input:
reconstructed = line
for word, dev in hindi_to_devanagari.items():
reconstructed = re.sub(
rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
dev,
reconstructed,
flags=re.IGNORECASE
)
f.write(reconstructed + '\n')
f.write("\n=== Hindi Words and Transliterations ===\n")
for word, dev in sorted(hindi_to_devanagari.items()):
f.write(f"{word} -> {dev}\n")
print(f"\nOutput saved to: {output_file}")
else:
print("\nNo Hindi words found.")
if __name__ == '__main__':
    # Windows consoles often default to a legacy code page, which makes
    # printing Devanagari raise UnicodeEncodeError; force stdout to UTF-8.
    if sys.platform == 'win32':
        try:
            # Python 3.7+: reconfigure the existing text stream in place.
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, TypeError):
            # Fallback for wrapped/older streams without reconfigure():
            # rewrap the raw buffer, replacing unencodable characters.
            import io
            sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    main()
def process_text(text, device=None, threshold=0.80, dictionary_path=None):
"""
Processes input text and returns a dictionary with classification and transliteration results.
Args:
text (str): The input text to analyze.
device (str, optional): The device to use (cpu or cuda). Defaults to None.
threshold (float, optional): Confidence threshold for classifying Hindi tokens. Defaults to 0.80.
dictionary_path (str, optional): Path to custom transliteration dictionary. Defaults to None.
Returns:
dict: {
"hindi_words": set of detected Hindi tokens,
"transliterations": dict of {word: devanagari},
"final_text": reconstructed text with Hindi words replaced by Devanagari,
"output_file": path to saved final text file
}
"""
tokenizer, model, device = load_model(device)
dictionary = load_dictionary(dictionary_path)
transliterator = HindiTransliterator()
preds = classify_text(text, tokenizer, model, device, threshold)
hindi_words = set(pred.token for pred in preds if pred.label == 'HI')
if not hindi_words:
return {
"hindi_words": set(),
"transliterations": {},
"final_text": text,
"output_file": None
}
hindi_to_devanagari = {}
for word in sorted(hindi_words):
try:
dev = get_transliteration(word, dictionary, transliterator)
hindi_to_devanagari[word] = dev
except Exception:
hindi_to_devanagari[word] = word # fallback
reconstructed = text
for word, dev in hindi_to_devanagari.items():
reconstructed = re.sub(
rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
dev,
reconstructed,
flags=re.IGNORECASE
)
output_file = os.path.join(BASE_DIR, 'final_output.txt')
with open(output_file, 'w', encoding='utf-8') as f:
f.write("=== Original Text ===\n")
f.write(text + "\n\n")
f.write("=== Reconstructed Text ===\n")
f.write(reconstructed + "\n")
return {
"hindi_words": hindi_words,
"transliterations": hindi_to_devanagari,
"final_text": reconstructed,
"output_file": output_file
}