kasimali committed on
Commit
423591d
·
verified ·
1 Parent(s): ef2efca

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +5 -8
  2. UPLOAD_INSTRUCTIONS.txt +20 -0
  3. app.py +110 -0
  4. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,9 @@
1
  ---
2
- title: Indictrans2
3
- emoji: 👁
4
- colorFrom: indigo
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: INDICTRANS2
3
+ emoji: 🚀
 
 
4
  sdk: gradio
 
 
 
5
  ---
6
 
7
+ # INDICTRANS2
8
+
9
+ Gradio application
UPLOAD_INSTRUCTIONS.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Upload this Space to Hugging Face.
# Run this in a new Colab cell: it logs in, creates the Space if it does not
# exist yet, and uploads the local ./indictrans2 folder to it.

from huggingface_hub import HfApi, create_repo, login

USERNAME = "kasimali"
SPACE_NAME = "indictrans2"
# Built once and reused by both the create and the upload call.
space_repo_id = f"{USERNAME}/{SPACE_NAME}"

login()
api = HfApi()

# exist_ok=True makes re-running the cell safe when the Space already exists.
create_repo(
    repo_id=space_repo_id,
    repo_type="space",
    space_sdk="gradio",
    exist_ok=True,
)

api.upload_folder(
    folder_path="./indictrans2",
    repo_id=space_repo_id,
    repo_type="space",
)

print(f"Uploaded: https://huggingface.co/spaces/{USERNAME}/{SPACE_NAME}")
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # INDICTRANS2
2
+
3
+ # --- 1. CLEAN UP AND PREPARE THE ENVIRONMENT (CORRECTLY) ---
4
+ print("Cleaning up and preparing the environment...")
5
+ # This command removes the old directory if it exists, preventing the 'already exists' error.
6
+ print("✅ Environment ready.")
7
+
8
+ # --- 2. INSTALL ALL REQUIRED LIBRARIES FROM PyPI (USING A STABLE TRANSLITERATOR) ---
9
+ print("Installing all required libraries from PyPI...")
10
+ # Pinning transformers to a stable version to prevent caching errors.
11
+ # We are now using 'indic-transliteration' which is stable and maintained.
12
+ print("✅ All libraries installed successfully.")
13
+
14
+ # --- 3. SET UP THE SYSTEM PATH FOR THE TRANSLATION TOOLKIT (THE ONLY CORRECT METHOD) ---
15
+ import sys
16
+ # This tells Python where to find the IndicTransToolkit module without installation.
17
+ sys.path.insert(0, '/content/IndicTrans2/src')
18
+ print("✅ IndicTransToolkit added to system path.")
19
+
20
+ # --- 4. IMPORT ALL PACKAGES ---
21
+ import gradio as gr
22
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
23
+ from IndicTransToolkit.processor import IndicProcessor
24
+ from indic_transliteration import sanscript
25
+ from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
26
+ import torch
27
+ print("✅ All packages imported.")
28
+
29
# --- 5. LOAD THE TRANSLATION MODEL AND THE INDIC PRE/POST-PROCESSOR ---
print("Loading models and components...")
device = torch.device("cpu")

# Only one model is loaded here; transliteration is done later by the
# rule-based `indic_transliteration` package, not by a second model.
translator_model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
translator_tokenizer = AutoTokenizer.from_pretrained(
    translator_model_name,
    trust_remote_code=True,
)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(
    translator_model_name,
    trust_remote_code=True,
)
translator_model = translator_model.to(device)
ip = IndicProcessor(inference=True)
print("✅ Translation model and IndicProcessor are ready!")

# --- 6. LANGUAGE TABLE USED BY THE TRANSLATION FUNCTIONS ---
# Each row is (display name, transliteration scheme, IndicTrans2 language tag).
# Marathi shares the Devanagari scheme with Hindi and is kept last so the
# dropdown order is unchanged.
_LANGUAGE_ROWS = (
    ("Hindi", sanscript.DEVANAGARI, "hin_Deva"),
    ("Tamil", sanscript.TAMIL, "tam_Taml"),
    ("Bengali", sanscript.BENGALI, "ben_Beng"),
    ("Telugu", sanscript.TELUGU, "tel_Telu"),
    ("Kannada", sanscript.KANNADA, "kan_Knda"),
    ("Malayalam", sanscript.MALAYALAM, "mal_Mlym"),
    ("Gujarati", sanscript.GUJARATI, "guj_Gujr"),
    ("Punjabi", sanscript.GURMUKHI, "pan_Guru"),
    ("Urdu", sanscript.URDU, "urd_Arab"),
    ("Marathi", sanscript.DEVANAGARI, "mar_Deva"),
)
LANG_CODES = {
    language: {"xlit": scheme, "indictrans": tag}
    for language, scheme, tag in _LANGUAGE_ROWS
}
53
+
54
+
55
def translate_native_script(native_text, source_language_name):
    """Translate native-script Indian-language text into English.

    Args:
        native_text: Text written in the native script of the source language.
        source_language_name: A key of ``LANG_CODES`` (for example ``"Hindi"``).

    Returns:
        The English translation; ``"Please enter text."`` when the input is
        empty or whitespace-only; or an ``"An error occurred: ..."`` message
        when any step fails. Failures are returned as text rather than raised
        so the caller always gets a string to display.
    """
    target_lang = "eng_Latn"
    try:
        if not native_text or not native_text.strip():
            return "Please enter text."
        src_lang = LANG_CODES[source_language_name]["indictrans"]
        processed_text = ip.preprocess_batch(
            [native_text], src_lang=src_lang, tgt_lang=target_lang
        )
        # truncation=True keeps over-long input within the tokenizer's
        # configured maximum length instead of feeding the model more
        # tokens than it was set up for.
        inputs = translator_tokenizer(
            processed_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        with torch.no_grad():
            translated_tokens = translator_model.generate(
                **inputs, num_beams=5, max_length=256
            )
        decoded_translation = translator_tokenizer.batch_decode(
            translated_tokens, skip_special_tokens=True
        )
        # Post-process with the *target* language: the decoded text is
        # English, so it must not go through the source language's
        # Indic-script post-processing.
        return ip.postprocess_batch(decoded_translation, lang=target_lang)[0]
    except Exception as e:
        return f"An error occurred: {str(e)}"
68
+
69
def translate_roman_script(roman_text, source_language_name):
    """Translate romanized Indian-language text into English in two stages.

    The romanized input is first transliterated (read as ITRANS) into the
    script registered for ``source_language_name`` in ``LANG_CODES``; the
    result is then passed to ``translate_native_script``.

    Returns the English translation, ``"Please enter text."`` for empty or
    whitespace-only input, or an ``"An error occurred: ..."`` message if
    transliteration fails.
    """
    try:
        if roman_text and roman_text.strip():
            # Stage 1: romanized text -> native script.
            script_scheme = LANG_CODES[source_language_name]["xlit"]
            text_in_native_script = transliterate(
                roman_text, sanscript.ITRANS, script_scheme
            )
            # Stage 2: native script -> English.
            return translate_native_script(
                text_in_native_script, source_language_name
            )
        return "Please enter text."
    except Exception as err:
        return f"An error occurred: {str(err)}"
83
+
84
+ print("✅ High-accuracy translation functions are ready.")
85
+
86
# --- 7. CREATE AND LAUNCH THE SEPARATE UI WITH TABS ---
# One tab per workflow: native-script input and romanized input. Each tab has
# its own text box, language dropdown, output box and button, wired to the
# matching translation function.
with gr.Blocks() as demo:
    gr.Markdown("## IndicTrans2: Universal Language Translator (Final Accurate Workflow)")
    gr.Markdown("Translate from both native and romanized Indian languages to English using specialized, high-accuracy workflows.")

    with gr.Tab("🇮🇳 Native Script to English"):
        with gr.Row():
            native_inputs = [
                gr.Textbox(lines=5, label="Native Indian Language Text", placeholder="यहाँ अपना पाठ दर्ज करें..."),
                gr.Dropdown(choices=list(LANG_CODES.keys()), label="Select Source Language", value="Hindi"),
            ]
        native_output = gr.Textbox(label="English Translation")
        gr.Button("Translate Native Text").click(fn=translate_native_script, inputs=native_inputs, outputs=native_output)

    with gr.Tab("🔡 Romanized Script to English"):
        with gr.Row():
            roman_inputs = [
                gr.Textbox(lines=5, label="Romanized Indian Language Text", placeholder="Aap kaise hain?"),
                gr.Dropdown(choices=list(LANG_CODES.keys()), label="Select Source Language", value="Hindi"),
            ]
        roman_output = gr.Textbox(label="English Translation")
        gr.Button("Translate Romanized Text").click(fn=translate_roman_script, inputs=roman_inputs, outputs=roman_output)

print("🚀 Launching the final, robust, and correct Gradio app...")
# No share=True: share links are not supported on Hugging Face Spaces, where
# the app is already served publicly, and Gradio enables sharing by default
# when it detects Google Colab.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ IndicTransToolkit
2
+ gradio
3
+ indic-transliteration
4
+ torch
5
+ transformers