Thilak118 committed on
Commit
022dc69
·
verified ·
1 Parent(s): 47ad443

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -93
app.py CHANGED
@@ -1,75 +1,40 @@
1
  import gradio as gr
2
  import torch
3
  import re
4
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
5
- from deep_translator import GoogleTranslator
6
 
7
- # -----------------------------
8
- # Load model & tokenizer
9
- # -----------------------------
10
- MODEL_NAME = "Thilak118/indic-bert-toxicity-classifier_tamil"
11
 
12
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 
14
 
15
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
  model.to(device)
17
- model.eval()
18
 
19
- # -----------------------------
20
- # Translator (English → Tamil)
21
- # -----------------------------
22
- translator = GoogleTranslator(source="auto", target="ta")
23
-
24
- # -----------------------------
25
- # Utility functions
26
- # -----------------------------
27
- def is_tamil_text(text):
28
- return bool(re.search(r"[\u0B80-\u0BFF]", text))
29
 
30
  def clean_text(text):
31
- text = re.sub(r"[^\u0B80-\u0BFF\s.,!?]", "", text)
32
- text = re.sub(r"\s+", " ", text).strip()
33
  return text
34
 
35
- def transliterate_to_tamil(text):
36
- try:
37
- return translator.translate(text)
38
- except Exception:
39
- return "❌ Transliteration failed"
40
-
41
- # -----------------------------
42
- # Prediction function
43
- # -----------------------------
44
- def predict_toxicity(user_input):
45
- if not user_input or not user_input.strip():
46
- return "❌ Please enter some text"
47
-
48
- # Step 1: Convert to Tamil if needed
49
- if is_tamil_text(user_input):
50
- tamil_text = user_input
51
- else:
52
- tamil_text = transliterate_to_tamil(user_input)
53
- if "failed" in tamil_text.lower():
54
- return tamil_text
55
-
56
- # Step 2: Clean Tamil text
57
- cleaned_text = clean_text(tamil_text)
58
-
59
- if not cleaned_text:
60
- return "❌ Invalid Tamil text after cleaning"
61
-
62
- # Step 3: Tokenize
63
  inputs = tokenizer(
64
  cleaned_text,
65
  return_tensors="pt",
66
  padding=True,
67
  truncation=True,
68
  max_length=128
69
- )
70
- inputs = {k: v.to(device) for k, v in inputs.items()}
71
 
72
- # Step 4: Inference
73
  with torch.no_grad():
74
  outputs = model(**inputs)
75
 
@@ -80,54 +45,28 @@ def predict_toxicity(user_input):
80
  label = "Toxic" if prediction == 0 else "Non-Toxic"
81
 
82
  return (
83
- f"📝 Tamil Text: {cleaned_text}\n"
84
- f"🔍 Prediction: {label}\n"
85
- f"📊 Confidence: {confidence:.2f}%"
86
  )
87
 
88
- # -----------------------------
89
- # Gradio UI
90
- # -----------------------------
91
- with gr.Blocks(title="Tamil Toxicity Classifier") as app:
92
  gr.Markdown(
93
  """
94
- ## Tamil Text Toxicity Classifier 🇮🇳
95
- Enter **Tamil text** or **English transliteration**
96
- (example: `nee romba mosamaanavan` → நீ ரொம்ப மோசமானவன்)
97
  """
98
  )
99
 
100
- with gr.Row():
101
- input_text = gr.Textbox(
102
- label="Input Text",
103
- placeholder="e.g., nee romba mosamaanavan",
104
- lines=2
105
- )
106
- preview_text = gr.Textbox(
107
- label="Tamil Preview",
108
- interactive=False,
109
- lines=2
110
- )
111
-
112
- preview_btn = gr.Button("Preview Transliteration")
113
- predict_btn = gr.Button("Predict Toxicity")
114
-
115
- output = gr.Textbox(
116
- label="Result",
117
- interactive=False,
118
- lines=5
119
- )
120
 
121
- preview_btn.click(
122
- fn=transliterate_to_tamil,
123
- inputs=input_text,
124
- outputs=preview_text
125
- )
126
 
127
- predict_btn.click(
128
- fn=predict_toxicity,
129
- inputs=input_text,
130
- outputs=output
131
- )
132
 
133
- app.launch()
 
1
  import gradio as gr
2
  import torch
3
  import re
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
+ from indic_transliteration.sanscript import transliterate, ITRANS, TAMIL
6
 
7
+ MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_tamil"
 
 
 
8
 
9
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
10
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
11
+ model.eval()
12
 
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  model.to(device)
 
15
 
16
+ def transliterate_to_tamil(text):
17
+ if text and text.strip():
18
+ return transliterate(text, ITRANS, TAMIL)
19
+ return ""
 
 
 
 
 
 
20
 
21
  def clean_text(text):
22
+ text = re.sub(r'[^\u0B80-\u0BFF\s.,!?]', '', text)
23
+ text = re.sub(r'\s+', ' ', text).strip()
24
  return text
25
 
26
+ def predict_toxicity(input_text):
27
+ ta_text = transliterate_to_tamil(input_text)
28
+ cleaned_text = clean_text(ta_text)
29
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  inputs = tokenizer(
31
  cleaned_text,
32
  return_tensors="pt",
33
  padding=True,
34
  truncation=True,
35
  max_length=128
36
+ ).to(device)
 
37
 
 
38
  with torch.no_grad():
39
  outputs = model(**inputs)
40
 
 
45
  label = "Toxic" if prediction == 0 else "Non-Toxic"
46
 
47
  return (
48
+ f"Tamil Text: {cleaned_text}\n"
49
+ f"Prediction: {label}\n"
50
+ f"Confidence: {confidence:.2f}%"
51
  )
52
 
53
+ with gr.Blocks(title="Tamil Toxicity Classifier") as demo:
 
 
 
54
  gr.Markdown(
55
  """
56
+ # Tamil Text Toxicity Classifier 🇮🇳
57
+ Enter **English transliteration**
58
+ Example: `nee romba mosam`
59
  """
60
  )
61
 
62
+ input_text = gr.Textbox(label="Enter Text (English)", lines=2)
63
+ preview = gr.Textbox(label="Tamil Text", interactive=False)
64
+ output = gr.Textbox(label="Prediction", lines=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ preview_btn = gr.Button("Preview Tamil Text")
67
+ predict_btn = gr.Button("Predict Toxicity")
 
 
 
68
 
69
+ preview_btn.click(transliterate_to_tamil, input_text, preview)
70
+ predict_btn.click(predict_toxicity, input_text, output)
 
 
 
71
 
72
+ demo.launch()