#!/usr/bin/env python3
"""Token-level language identification CLI with Devanagari transliteration.

Tokens labelled ``'HI'`` by the classifier are collected, transliterated and
substituted back into the input text.

NOTE(review): this module was recovered from a copy in which line breaks were
collapsed and the text between ``<`` and ``>`` had been stripped.  Spans that
had to be reconstructed are marked with ``NOTE(review)`` below and should be
checked against the original source.
"""
import argparse
import os
import re
import sys

from hindi_xlit import HindiTransliterator

from .classifier import load_model, classify_text
from .transliteration import load_dictionary, get_transliteration
from .utils import _print_predictions, _write_predictions, BASE_DIR


def _replace_words(text, mapping):
    """Return *text* with whole-word occurrences of *mapping* keys replaced.

    NOTE(review): the original pattern was lost after ``rf'(?``; a negative
    look-behind / look-ahead on ``\\w`` is the reconstruction used here, and
    any flags the original passed to ``re.sub`` are unknown -- confirm.

    Args:
        text (str): The text to rewrite.
        mapping (dict): ``{word: replacement}`` pairs.

    Returns:
        str: The rewritten text; *text* unchanged when there is nothing to
        replace.
    """
    # An empty key would produce a pattern that matches at every position.
    words = [word for word in mapping if word]
    if not words:
        return text
    # Longest candidates are tried first at any given position.
    words.sort(key=len, reverse=True)
    alternatives = "|".join(re.escape(word) for word in words)
    pattern = re.compile(rf'(?<!\w)(?:{alternatives})(?!\w)')
    # A callable replacement is inserted literally, so a backslash in the
    # transliteration can never be misread as a group reference.
    return pattern.sub(lambda match: mapping[match.group(0)], text)


def _finish_interactive_session(all_input, hindi_words, dictionary, transliterator):
    """Print the transliterations for a session and save ``final_output.txt``.

    Args:
        all_input (list): Every non-empty line the user entered, in order.
        hindi_words (set): Tokens labelled ``'HI'`` during the session.
        dictionary: Object returned by ``load_dictionary``.
        transliterator: ``HindiTransliterator`` instance.
    """
    if not hindi_words:
        print("\nNo Hindi words found.")
        return

    print("\nAll Hindi words found:", ", ".join(sorted(hindi_words)))
    print("\nTransliterated to Devanagari:")
    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            devanagari = get_transliteration(word, dictionary, transliterator)
            hindi_to_devanagari[word] = devanagari
            src = " (dictionary)" if word.lower() in dictionary else " (model)"
            print(f"{word} -> {devanagari}{src}")
        except Exception as e:
            # Best effort: one failing word must not abort the whole report.
            print(f"{word} -> [Error: {str(e)}]")

    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=== Original Text ===\n")
        f.write('\n'.join(all_input) + '\n\n')
        f.write("=== Reconstructed Text with Devanagari ===\n")
        for line in all_input:
            f.write(_replace_words(line, hindi_to_devanagari) + '\n')
        # NOTE(review): only the trailing `{dev}\n` of the next write
        # survived; the section header and the "word -> dev" layout are
        # reconstructed from the console output format above -- confirm.
        f.write("\n=== Transliterations ===\n")
        for word, dev in hindi_to_devanagari.items():
            f.write(f"{word} -> {dev}\n")
    print(f"\nOutput saved to: {output_file}")


def main():
    """Run the command-line interface.

    With ``--text`` the given text is classified once, the predictions are
    printed and written, and the detected Hindi tokens are listed.  Without
    it, lines are read from standard input until ``QUIT``, end of input or
    Ctrl-C; the session is then summarised and saved to ``final_output.txt``
    under ``BASE_DIR``.
    """
    parser = argparse.ArgumentParser(
        description='Test l3cube-pune/hing-bert-lid on text (token-level).')
    parser.add_argument('--device', type=str, default=None)
    parser.add_argument('--text', type=str, default=None)
    parser.add_argument('--threshold', type=float, default=0.80)
    parser.add_argument('--dictionary', type=str, default=None)
    args = parser.parse_args()

    tokenizer, model, device = load_model(args.device)
    dictionary = load_dictionary(args.dictionary)
    transliterator = HindiTransliterator()
    hindi_words = set()

    if args.text:
        preds = classify_text(args.text, tokenizer, model, device, args.threshold)
        _print_predictions(preds)
        _write_predictions(preds, args.text)
        hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
        # Sorted so the listing is deterministic, as in interactive mode.
        print("\nHindi words found:",
              ", ".join(sorted(hindi_words)) if hindi_words else "None")
        return

    print("Interactive mode. Type text lines (QUIT to exit).")
    all_input = []
    try:
        for line in sys.stdin:
            line = line.rstrip('\n')
            if not line:
                continue
            if line.strip().upper() == 'QUIT':
                break
            all_input.append(line)
            preds = classify_text(line, tokenizer, model, device, args.threshold)
            _print_predictions(preds)
            _write_predictions(preds, line)
            hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
            print()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / end of input simply ends the session.
        pass
    finally:
        # Kept in `finally` so the summary is still produced when the loop
        # is interrupted or fails part-way through.
        _finish_interactive_session(all_input, hindi_words, dictionary, transliterator)


def process_text(text, device=None, threshold=0.80, dictionary_path=None):
    """
    Processes input text and returns a dictionary with classification and
    transliteration results.

    Args:
        text (str): The input text to analyze.
        device (str, optional): The device to use (cpu or cuda). Defaults to None.
        threshold (float, optional): Confidence threshold for classifying Hindi
            tokens. Defaults to 0.80.
        dictionary_path (str, optional): Path to custom transliteration
            dictionary. Defaults to None.

    Returns:
        dict: {
            "hindi_words": set of detected Hindi tokens,
            "transliterations": dict of {word: devanagari},
            "final_text": reconstructed text with Hindi words replaced by Devanagari,
            "output_file": path to saved final text file
        }
    """
    tokenizer, model, device = load_model(device)
    dictionary = load_dictionary(dictionary_path)
    transliterator = HindiTransliterator()

    preds = classify_text(text, tokenizer, model, device, threshold)
    hindi_words = {pred.token for pred in preds if pred.label == 'HI'}
    if not hindi_words:
        return {
            "hindi_words": set(),
            "transliterations": {},
            "final_text": text,
            "output_file": None,
        }

    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            dev = get_transliteration(word, dictionary, transliterator)
            hindi_to_devanagari[word] = dev
        except Exception:
            hindi_to_devanagari[word] = word  # fallback

    reconstructed = _replace_words(text, hindi_to_devanagari)

    # NOTE(review): everything after the substitution was lost from the
    # recovered source.  The file name and location mirror main(); what is
    # written and the return value follow the docstring above -- confirm.
    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(reconstructed)

    return {
        "hindi_words": hindi_words,
        "transliterations": hindi_to_devanagari,
        "final_text": reconstructed,
        "output_file": output_file,
    }


if __name__ == '__main__':
    if sys.platform == 'win32':
        # Force UTF-8 on stdout so Devanagari output can be printed.
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, TypeError):
            import io
            sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    main()