Thilak118 committed on
Commit
51cff32
·
verified ·
1 Parent(s): 022dc69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -13
app.py CHANGED
@@ -4,6 +4,9 @@ import re
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
  from indic_transliteration.sanscript import transliterate, ITRANS, TAMIL
6
 
 
 
 
7
  MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_tamil"
8
 
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
@@ -13,25 +16,41 @@ model.eval()
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  model.to(device)
15
 
 
 
 
16
  def transliterate_to_tamil(text):
17
  if text and text.strip():
18
- return transliterate(text, ITRANS, TAMIL)
 
 
 
19
  return ""
20
 
 
 
 
21
  def clean_text(text):
22
- text = re.sub(r'[^\u0B80-\u0BFF\s.,!?]', '', text)
23
  text = re.sub(r'\s+', ' ', text).strip()
24
  return text
25
 
 
 
 
26
  def predict_toxicity(input_text):
27
  ta_text = transliterate_to_tamil(input_text)
 
 
 
 
28
  cleaned_text = clean_text(ta_text)
29
 
30
  inputs = tokenizer(
31
  cleaned_text,
32
  return_tensors="pt",
33
- padding=True,
34
  truncation=True,
 
35
  max_length=128
36
  ).to(device)
37
 
@@ -50,23 +69,52 @@ def predict_toxicity(input_text):
50
  f"Confidence: {confidence:.2f}%"
51
  )
52
 
53
- with gr.Blocks(title="Tamil Toxicity Classifier") as demo:
 
 
 
54
  gr.Markdown(
55
  """
56
  # Tamil Text Toxicity Classifier 🇮🇳
57
- Enter **English transliteration**
58
- Example: `nee romba mosam`
59
  """
60
  )
61
 
62
- input_text = gr.Textbox(label="Enter Text (English)", lines=2)
63
- preview = gr.Textbox(label="Tamil Text", interactive=False)
64
- output = gr.Textbox(label="Prediction", lines=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
- preview_btn = gr.Button("Preview Tamil Text")
67
- predict_btn = gr.Button("Predict Toxicity")
 
 
 
68
 
69
- preview_btn.click(transliterate_to_tamil, input_text, preview)
70
- predict_btn.click(predict_toxicity, input_text, output)
 
 
 
71
 
72
  demo.launch()
 
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
  from indic_transliteration.sanscript import transliterate, ITRANS, TAMIL
6
 
7
+ # -----------------------------
8
+ # Load Model & Tokenizer
9
+ # -----------------------------
10
  MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_tamil"
11
 
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
 
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
  model.to(device)
18
 
# -----------------------------
# Tamil Transliteration (Tanglish → Tamil)
# -----------------------------
def transliterate_to_tamil(text):
    """Convert ITRANS-style romanized text to Tamil script.

    Args:
        text: Romanized input string; may be None or empty.

    Returns:
        The transliterated string; "" when ``text`` is None, empty or
        whitespace-only; or the literal "Transliteration failed" when the
        transliteration call raises.
    """
    # Guard clause: nothing to transliterate.
    if not text or not text.strip():
        return ""

    try:
        return transliterate(text, ITRANS, TAMIL)
    except Exception:
        # The sentinel string is kept because predict_toxicity looks for
        # "failed" in the result, but the error is no longer discarded:
        # the traceback is logged so failures can be diagnosed.
        import logging

        logging.getLogger(__name__).exception("Transliteration failed")
        return "Transliteration failed"
29
 
# -----------------------------
# Text Cleaning (Tamil)
# -----------------------------
def clean_text(text):
    """Strip unsupported characters and normalize whitespace.

    Characters kept: the Tamil Unicode block (U+0B80-U+0BFF), ASCII letters
    and digits, whitespace, and the punctuation ``. , ! ?``. Everything else
    is removed. Runs of whitespace are then collapsed to a single space and
    the result is trimmed.

    Args:
        text: String to clean. ``None`` or "" yields "".

    Returns:
        The cleaned string.
    """
    # None would make re.sub raise TypeError; treat it like empty input.
    if not text:
        return ""

    kept = re.sub(r'[^\u0B80-\u0BFFa-zA-Z0-9\s.,!?]', '', text)
    return re.sub(r'\s+', ' ', kept).strip()
37
 
38
+ # -----------------------------
39
+ # Prediction
40
+ # -----------------------------
41
  def predict_toxicity(input_text):
42
  ta_text = transliterate_to_tamil(input_text)
43
+
44
+ if "failed" in ta_text.lower():
45
+ return f"Tamil Text: {ta_text}\nPrediction: Failed"
46
+
47
  cleaned_text = clean_text(ta_text)
48
 
49
  inputs = tokenizer(
50
  cleaned_text,
51
  return_tensors="pt",
 
52
  truncation=True,
53
+ padding=True,
54
  max_length=128
55
  ).to(device)
56
 
 
69
  f"Confidence: {confidence:.2f}%"
70
  )
71
 
# -----------------------------
# Gradio UI (Same as Malayalam)
# -----------------------------
with gr.Blocks(title="Tamil Text Toxicity Classifier") as demo:
    # Page header with a usage hint for the romanized input.
    gr.Markdown(
        """
        # Tamil Text Toxicity Classifier 🇮🇳
        Enter Tamil text in **English transliteration (Tanglish)**
        Example: `nee romba mosamaanavan`
        """
    )

    # Top row: editable romanized input next to a read-only Tamil preview.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                lines=2,
                label="Enter Text (English Transliteration)",
                placeholder="nee romba mosamaanavan",
            )
        with gr.Column():
            preview_text = gr.Textbox(
                lines=2,
                interactive=False,
                label="Tamil Text",
            )

    # Both action buttons share one row.
    with gr.Row():
        preview_btn = gr.Button("Preview Transliteration")
        predict_btn = gr.Button("Predict Toxicity")

    output_text = gr.Textbox(
        lines=5,
        interactive=False,
        label="Prediction Output",
    )

    # Wiring: (callback, input component, output component).
    preview_btn.click(transliterate_to_tamil, input_text, preview_text)
    predict_btn.click(predict_toxicity, input_text, output_text)

demo.launch()