|
|
|
|
|
import argparse |
|
|
import os |
|
|
import sys |
|
|
import re |
|
|
from hindi_xlit import HindiTransliterator |
|
|
from .classifier import load_model, classify_text |
|
|
from .transliteration import load_dictionary, get_transliteration |
|
|
from .utils import _print_predictions, _write_predictions, BASE_DIR |
|
|
|
|
|
def _replace_word(text, word, devanagari):
    """Replace standalone occurrences of *word* in *text* with *devanagari*.

    Matching is case-insensitive and bounded so that *word* is not replaced
    inside a larger word or hyphenated compound.  The replacement is supplied
    as a callable so that re.sub never interprets backslashes or group
    references that might appear in the substitution text.
    """
    pattern = rf'(?<![\w\-])({re.escape(word)})(?![\w\-])'
    return re.sub(pattern, lambda _m: devanagari, text, flags=re.IGNORECASE)


def _transliterate_and_report(hindi_words, dictionary, transliterator):
    """Transliterate each detected Hindi word, printing one line per word.

    Returns a dict mapping each successfully transliterated word to its
    Devanagari form; words that raise are reported but omitted from the map.
    """
    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            devanagari = get_transliteration(word, dictionary, transliterator)
            hindi_to_devanagari[word] = devanagari
            # Report whether the result came from the dictionary lookup or the
            # model fallback (dictionary keys are assumed lowercase — matches
            # the lookup convention used here).
            src = " (dictionary)" if word.lower() in dictionary else " (model)"
            print(f"{word} -> {devanagari}{src}")
        except Exception as e:
            print(f"{word} -> [Error: {str(e)}]")
    return hindi_to_devanagari


def _save_final_output(all_input, hindi_to_devanagari):
    """Write the session transcript, reconstructed text, and word map to
    final_output.txt under BASE_DIR, then print the saved path."""
    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=== Original Text ===\n")
        f.write('\n'.join(all_input) + '\n\n')
        f.write("=== Reconstructed Text with Devanagari ===\n")
        for line in all_input:
            reconstructed = line
            for word, dev in hindi_to_devanagari.items():
                reconstructed = _replace_word(reconstructed, word, dev)
            f.write(reconstructed + '\n')
        f.write("\n=== Hindi Words and Transliterations ===\n")
        for word, dev in sorted(hindi_to_devanagari.items()):
            f.write(f"{word} -> {dev}\n")
    print(f"\nOutput saved to: {output_file}")


def main():
    """CLI entry point: token-level language ID plus Hindi transliteration.

    With ``--text`` the given string is classified once and the detected
    Hindi tokens are listed.  Without it, lines are read interactively from
    stdin (``QUIT`` exits); on exit, all detected Hindi words are
    transliterated to Devanagari and a reconstructed transcript is written
    to ``final_output.txt``.
    """
    parser = argparse.ArgumentParser(description='Test l3cube-pune/hing-bert-lid on text (token-level).')
    parser.add_argument('--device', type=str, default=None)
    parser.add_argument('--text', type=str, default=None)
    parser.add_argument('--threshold', type=float, default=0.80)
    parser.add_argument('--dictionary', type=str, default=None)
    args = parser.parse_args()

    tokenizer, model, device = load_model(args.device)
    dictionary = load_dictionary(args.dictionary)
    transliterator = HindiTransliterator()
    hindi_words = set()

    # Single-shot mode: classify the supplied text, report, and exit.
    if args.text:
        preds = classify_text(args.text, tokenizer, model, device, args.threshold)
        _print_predictions(preds)
        _write_predictions(preds, args.text)
        hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
        print("\nHindi words found:", ", ".join(hindi_words) if hindi_words else "None")
        return

    # Interactive mode: classify line by line, accumulate Hindi tokens.
    print("Interactive mode. Type text lines (QUIT to exit).")
    all_input = []
    try:
        for line in sys.stdin:
            line = line.rstrip('\n')
            if not line:
                continue
            if line.strip().upper() == 'QUIT':
                break
            all_input.append(line)
            preds = classify_text(line, tokenizer, model, device, args.threshold)
            _print_predictions(preds)
            _write_predictions(preds, line)
            hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
            print()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / Ctrl-D end the session; the summary below still runs.
        pass
    finally:
        if hindi_words:
            print("\nAll Hindi words found:", ", ".join(sorted(hindi_words)))
            print("\nTransliterated to Devanagari:")
            hindi_to_devanagari = _transliterate_and_report(hindi_words, dictionary, transliterator)
            _save_final_output(all_input, hindi_to_devanagari)
        else:
            print("\nNo Hindi words found.")
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # The Windows console frequently defaults to a non-UTF-8 code page,
    # which would make printing Devanagari output fail; force UTF-8 first.
    if sys.platform == 'win32':
        try:
            # Python 3.7+: re-encode the existing stdout stream in place.
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, TypeError):
            # Fallback for streams without reconfigure(): rebuild stdout
            # around the raw byte buffer, replacing unencodable characters
            # rather than raising.
            import io
            sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    main()
|
|
def process_text(text, device=None, threshold=0.80, dictionary_path=None):
    """
    Processes input text and returns a dictionary with classification and transliteration results.

    Args:
        text (str): The input text to analyze.
        device (str, optional): The device to use (cpu or cuda). Defaults to None.
        threshold (float, optional): Confidence threshold for classifying Hindi tokens. Defaults to 0.80.
        dictionary_path (str, optional): Path to custom transliteration dictionary. Defaults to None.

    Returns:
        dict: {
            "hindi_words": set of detected Hindi tokens,
            "transliterations": dict of {word: devanagari},
            "final_text": reconstructed text with Hindi words replaced by Devanagari,
            "output_file": path to saved final text file (None when no Hindi words found)
        }
    """
    tokenizer, model, device = load_model(device)
    dictionary = load_dictionary(dictionary_path)
    transliterator = HindiTransliterator()

    preds = classify_text(text, tokenizer, model, device, threshold)
    hindi_words = {pred.token for pred in preds if pred.label == 'HI'}

    # Nothing to transliterate: return the input unchanged, no file written.
    if not hindi_words:
        return {
            "hindi_words": set(),
            "transliterations": {},
            "final_text": text,
            "output_file": None
        }

    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            dev = get_transliteration(word, dictionary, transliterator)
            hindi_to_devanagari[word] = dev
        except Exception:
            # Best-effort: a word that fails transliteration maps to itself
            # so reconstruction below leaves it untouched.
            hindi_to_devanagari[word] = word

    # Replace each Hindi word (case-insensitive, not inside a larger or
    # hyphenated word) with its Devanagari form.  The replacement is passed
    # as a callable so re.sub never interprets backslashes or group
    # references that might occur in the substitution text.
    reconstructed = text
    for word, dev in hindi_to_devanagari.items():
        reconstructed = re.sub(
            rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
            lambda _m, _dev=dev: _dev,
            reconstructed,
            flags=re.IGNORECASE
        )

    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=== Original Text ===\n")
        f.write(text + "\n\n")
        f.write("=== Reconstructed Text ===\n")
        f.write(reconstructed + "\n")

    return {
        "hindi_words": hindi_words,
        "transliterations": hindi_to_devanagari,
        "final_text": reconstructed,
        "output_file": output_file
    }
|
|
|