File size: 6,212 Bytes
8a02978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
import argparse
import os
import sys
import re
from hindi_xlit import HindiTransliterator
from .classifier import load_model, classify_text
from .transliteration import load_dictionary, get_transliteration
from .utils import _print_predictions, _write_predictions, BASE_DIR

def main():
    """Command-line entry point: classify tokens and transliterate Hindi ones.

    Parses ``--device``, ``--text``, ``--threshold`` and ``--dictionary``,
    loads the model, the transliteration dictionary and a
    ``HindiTransliterator``, then runs in one of two modes:

    * One-shot (``--text`` given and non-empty): classify the text, print and
      write the predictions, list the tokens labelled ``'HI'`` and return.
      No transliteration is performed in this mode.
    * Interactive (otherwise): read lines from stdin until a line equal to
      ``QUIT`` (case-insensitive, surrounding whitespace ignored), end of
      input, or Ctrl-C. Each non-empty line is classified as it arrives. On
      exit, every collected ``'HI'`` token is transliterated and the original
      lines, the reconstructed lines and the word mapping are written to
      ``final_output.txt`` under ``BASE_DIR``.
    """
    parser = argparse.ArgumentParser(description='Test l3cube-pune/hing-bert-lid on text (token-level).')
    parser.add_argument('--device', type=str, default=None)
    parser.add_argument('--text', type=str, default=None)
    parser.add_argument('--threshold', type=float, default=0.80)
    parser.add_argument('--dictionary', type=str, default=None)
    args = parser.parse_args()

    tokenizer, model, device = load_model(args.device)
    dictionary = load_dictionary(args.dictionary)
    transliterator = HindiTransliterator()
    hindi_words = set()

    if args.text:
        preds = classify_text(args.text, tokenizer, model, device, args.threshold)
        _print_predictions(preds)
        _write_predictions(preds, args.text)
        hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
        # Sorted so the listing is deterministic, matching the interactive
        # summary below (set iteration order varies between runs).
        print("\nHindi words found:", ", ".join(sorted(hindi_words)) if hindi_words else "None")
        return

    print("Interactive mode. Type text lines (QUIT to exit).")
    all_input = []
    try:
        for line in sys.stdin:
            line = line.rstrip('\n')
            if not line:
                continue
            if line.strip().upper() == 'QUIT':
                break
            all_input.append(line)
            preds = classify_text(line, tokenizer, model, device, args.threshold)
            _print_predictions(preds)
            _write_predictions(preds, line)
            hindi_words.update(pred.token for pred in preds if pred.label == 'HI')
            print()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / end of input simply ends the session; the summary in the
        # ``finally`` clause is still produced.
        pass
    finally:
        # Runs on every exit path from the loop, including an unexpected
        # exception that is still propagating.
        if hindi_words:
            print("\nAll Hindi words found:", ", ".join(sorted(hindi_words)))
            print("\nTransliterated to Devanagari:")
            hindi_to_devanagari = {}
            for word in sorted(hindi_words):
                try:
                    devanagari = get_transliteration(word, dictionary, transliterator)
                    hindi_to_devanagari[word] = devanagari
                    src = " (dictionary)" if word.lower() in dictionary else " (model)"
                    print(f"{word} -> {devanagari}{src}")
                except Exception as e:
                    # A word that fails to transliterate is reported and left
                    # out of the mapping, so it stays unchanged in the output.
                    print(f"{word} -> [Error: {str(e)}]")

            # Compile one whole-word, case-insensitive pattern per word up
            # front instead of rebuilding it for every (line, word) pair.
            # The lookarounds reject matches adjacent to word characters or
            # hyphens, so a word is not replaced inside a longer token.
            substitutions = [
                (
                    re.compile(
                        rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
                        flags=re.IGNORECASE,
                    ),
                    dev,
                )
                for word, dev in hindi_to_devanagari.items()
            ]

            output_file = os.path.join(BASE_DIR, 'final_output.txt')
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write("=== Original Text ===\n")
                f.write('\n'.join(all_input) + '\n\n')
                f.write("=== Reconstructed Text with Devanagari ===\n")
                for line in all_input:
                    reconstructed = line
                    for pattern, dev in substitutions:
                        # A callable replacement inserts ``dev`` verbatim; a
                        # replacement *string* would have its backslashes and
                        # group references interpreted by ``re``.
                        reconstructed = pattern.sub(
                            lambda _match, dev=dev: dev,
                            reconstructed,
                        )
                    f.write(reconstructed + '\n')
                f.write("\n=== Hindi Words and Transliterations ===\n")
                for word, dev in sorted(hindi_to_devanagari.items()):
                    f.write(f"{word} -> {dev}\n")
            print(f"\nOutput saved to: {output_file}")
        else:
            print("\nNo Hindi words found.")


if __name__ == '__main__':
    # Script entry point. On Windows, switch stdout to UTF-8 before running
    # main(), which prints Devanagari text.
    if sys.platform == 'win32':
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, TypeError):
            # stdout has no usable reconfigure(); wrap its underlying binary
            # buffer instead, substituting characters that cannot be encoded.
            import io
            sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    main()
def process_text(text, device=None, threshold=0.80, dictionary_path=None):
    """
    Classify the tokens of ``text`` and replace the Hindi ones with Devanagari.

    The model, dictionary and transliterator are loaded on every call. When at
    least one token is labelled ``'HI'``, the original and reconstructed text
    are also written to ``final_output.txt`` under ``BASE_DIR``.

    Args:
        text (str): The input text to analyze.
        device (str, optional): The device to use (cpu or cuda). Defaults to None.
        threshold (float, optional): Confidence threshold for classifying Hindi tokens. Defaults to 0.80.
        dictionary_path (str, optional): Path to custom transliteration dictionary. Defaults to None.

    Returns:
        dict: {
            "hindi_words": set of detected Hindi tokens,
            "transliterations": dict of {word: devanagari}; a word whose
                transliteration raised maps to itself,
            "final_text": reconstructed text with Hindi words replaced by
                Devanagari (the input text unchanged if none were detected),
            "output_file": path to the saved final text file, or None when no
                Hindi tokens were detected (no file is written in that case)
        }
    """
    tokenizer, model, device = load_model(device)
    dictionary = load_dictionary(dictionary_path)
    transliterator = HindiTransliterator()

    preds = classify_text(text, tokenizer, model, device, threshold)
    hindi_words = {pred.token for pred in preds if pred.label == 'HI'}

    if not hindi_words:
        return {
            "hindi_words": set(),
            "transliterations": {},
            "final_text": text,
            "output_file": None
        }

    hindi_to_devanagari = {}
    for word in sorted(hindi_words):
        try:
            hindi_to_devanagari[word] = get_transliteration(word, dictionary, transliterator)
        except Exception:
            # Deliberate best-effort: a token that cannot be transliterated
            # is kept as-is rather than failing the whole call.
            hindi_to_devanagari[word] = word

    reconstructed = text
    for word, dev in hindi_to_devanagari.items():
        # Whole-word, case-insensitive match; the lookarounds reject matches
        # adjacent to word characters or hyphens. The replacement is a
        # callable so ``dev`` is inserted verbatim: a replacement *string*
        # would have backslashes and group references interpreted by ``re``,
        # which matters for the fallback above that reinserts the raw token.
        reconstructed = re.sub(
            rf'(?<![\w\-])({re.escape(word)})(?![\w\-])',
            lambda _match, dev=dev: dev,
            reconstructed,
            flags=re.IGNORECASE
        )

    output_file = os.path.join(BASE_DIR, 'final_output.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=== Original Text ===\n")
        f.write(text + "\n\n")
        f.write("=== Reconstructed Text ===\n")
        f.write(reconstructed + "\n")

    return {
        "hindi_words": hindi_words,
        "transliterations": hindi_to_devanagari,
        "final_text": reconstructed,
        "output_file": output_file
    }