Mahmoud-Dev committed on
Commit
1e900c3
·
verified ·
1 Parent(s): 4b0008c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -33
app.py CHANGED
@@ -1,29 +1,36 @@
1
  import gradio as gr
2
  import torch
3
  from datasets import load_dataset
4
- from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
5
  import numpy as np
6
 
7
- # Load the sentiment dataset
8
- dataset = load_dataset('k1tub/sentiment_dataset')
9
- print(f"Dataset loaded with {len(dataset['train'])} training examples")
 
 
 
 
10
 
11
- # Load tokenizer and model
12
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
13
- model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
14
 
15
  def preprocess_function(examples):
16
- # Tokenize the text
17
  encoding = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
18
  # Map label to indices
19
- encoding['labels'] = examples['label']
 
 
 
20
  return encoding
21
 
22
  # Preprocess the dataset
23
  tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text'])
24
 
25
  def train_model(epochs, batch_size, learning_rate):
26
- """Fine-tune DistilBERT on the sentiment dataset"""
27
  try:
28
  training_args = TrainingArguments(
29
  output_dir='./results',
@@ -40,43 +47,43 @@ def train_model(epochs, batch_size, learning_rate):
40
  model=model,
41
  args=training_args,
42
  train_dataset=tokenized_dataset['train'],
43
- eval_dataset=tokenized_dataset['validation'] if 'validation' in tokenized_dataset else tokenized_dataset['train'],
44
  )
45
 
46
  # Start training
47
  trainer.train()
48
 
49
- return "\u270d✅ Training completed successfully!\n" + \
50
- f"Model saved to ./results\nFinal learning rate: {learning_rate}\nEpochs: {epochs}\nBatch size: {batch_size}"
51
  except Exception as e:
52
- return f"❌ Error during training: {str(e)}"
53
 
54
  # Create Gradio interface
55
- with gr.Blocks(title="DistilBERT Sentiment Training") as demo:
56
  gr.Markdown("""
57
- # 🚀 DistilBERT Sentiment Analysis Training
58
 
59
- Fine-tune **DistilBERT** model on the **k1tub/sentiment_dataset** (290k examples)
60
 
61
- ### Model Info:
62
- - **Base Model**: distilbert-base-uncased (67M parameters)
63
- - **Task**: Text Classification (Sentiment Analysis)
64
- - **Dataset**: k1tub/sentiment_dataset
65
- - **Framework**: Hugging Face Transformers
66
- """)
67
 
68
  with gr.Row():
69
  with gr.Column():
70
- gr.Markdown("### Training Configuration")
71
- epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
72
  batch_size = gr.Slider(minimum=8, maximum=64, value=32, step=8, label="Batch Size")
73
  learning_rate = gr.Slider(minimum=1e-5, maximum=1e-3, value=2e-5, step=1e-5, label="Learning Rate")
74
 
75
  with gr.Column():
76
- gr.Markdown("### Training Status")
77
- output_text = gr.Textbox(label="Output", lines=10, interactive=False)
78
 
79
- train_button = gr.Button("🔥 Start Training", variant="primary", scale=2)
80
  train_button.click(
81
  fn=train_model,
82
  inputs=[epochs, batch_size, learning_rate],
@@ -84,11 +91,11 @@ with gr.Blocks(title="DistilBERT Sentiment Training") as demo:
84
  )
85
 
86
  gr.Markdown("""
87
- ### Training Details:
88
- - **Free Hardware**: CPU Basic on Hugging Face Spaces
89
- - **Training Time**: Depends on dataset size and hardware
90
- - **Model Output**: Saved to ./results folder
91
- - **Inference**: Can be deployed as a separate Space
92
  """)
93
 
94
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import torch
3
  from datasets import load_dataset
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
5
  import numpy as np
6
 
7
+ # Load the Arabic sentiment dataset (Saudi dialect from Twitter)
8
# Try the primary dataset repository first; if anything goes wrong while
# loading it (or while reading its 'train' split for the size report), fall
# back to the alternative repository.
try:
    dataset = load_dataset('arbml/Arabic_Sentiment_Twitter_Corpus')
    print(f"Dataset loaded with {len(dataset['train'])} training examples")
except Exception as exc:
    # `except Exception` instead of a bare `except:` so that Ctrl-C and
    # interpreter shutdown (KeyboardInterrupt / SystemExit) are not swallowed
    # and silently turned into a second download attempt.
    # Report why the primary source failed so the fallback is diagnosable.
    print(f"Primary dataset could not be loaded: {exc!r}")
    print("Loading alternative Arabic dataset...")
    dataset = load_dataset('asas-ai/Arabic_Sentiment_Twitter_Corpus')
14
 
15
+ # Load tokenizer and model (supports Arabic)
16
+ tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')
17
+ model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=3)
18
 
19
  def preprocess_function(examples):
20
+ # Tokenize the Arabic text
21
  encoding = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
22
  # Map label to indices
23
+ if 'label' in examples:
24
+ encoding['labels'] = examples['label']
25
+ elif 'sentiment' in examples:
26
+ encoding['labels'] = examples['sentiment']
27
  return encoding
28
 
29
  # Preprocess the dataset
30
  tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text'])
31
 
32
  def train_model(epochs, batch_size, learning_rate):
33
+ """Fine-tune DistilBERT on Arabic sentiment dataset (Saudi dialect)"""
34
  try:
35
  training_args = TrainingArguments(
36
  output_dir='./results',
 
47
  model=model,
48
  args=training_args,
49
  train_dataset=tokenized_dataset['train'],
50
+ eval_dataset=tokenized_dataset.get('validation', tokenized_dataset['train']),
51
  )
52
 
53
  # Start training
54
  trainer.train()
55
 
56
+ return "\u270d✅ \u062aم التدريب بنجاح!\n" + \
57
+ f"النموذج محفوظ في ./results\nمعدل التعلم: {learning_rate}\nعدد الحقب: {epochs}\nBatch Size: {batch_size}"
58
  except Exception as e:
59
+ return f"❌ خطأ أثناء التدريب: {str(e)}"
60
 
61
  # Create Gradio interface
62
+ with gr.Blocks(title="DistilBERT Arabic Sentiment Training") as demo:
63
  gr.Markdown("""
64
+ # 🚀 تدريب نموذج DistilBERT العربي
65
 
66
+ ضبط نموذج **DistilBERT** على تحليل المشاعر باللغة العربية (اللهجة السعودية)
67
 
68
+ ### معلومات النموذج:
69
+ - **النموذج الأساسي**: distilbert-base-multilingual-cased (67M معامل)
70
+ - **المهمة**: تصنيف النصوص (المتعد اللغات)
71
+ - **قاعدة البيانات**: arbml/Arabic_Sentiment_Twitter_Corpus (58.8k مثال)
72
+ - **اللغة**: العربية (اللهجة السعودية والخليجية)
73
+ """)
74
 
75
  with gr.Row():
76
  with gr.Column():
77
+ gr.Markdown("### إعدادات التدريب")
78
+ epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="\u0639دد الحقب (Epochs)")
79
  batch_size = gr.Slider(minimum=8, maximum=64, value=32, step=8, label="Batch Size")
80
  learning_rate = gr.Slider(minimum=1e-5, maximum=1e-3, value=2e-5, step=1e-5, label="Learning Rate")
81
 
82
  with gr.Column():
83
+ gr.Markdown("### حالة التدريب")
84
+ output_text = gr.Textbox(label="المخرجات", lines=10, interactive=False)
85
 
86
+ train_button = gr.Button("🔥 بدء التدريب", variant="primary", scale=2)
87
  train_button.click(
88
  fn=train_model,
89
  inputs=[epochs, batch_size, learning_rate],
 
91
  )
92
 
93
  gr.Markdown("""
94
+ ### تفاصيل التدريب:
95
+ - **مرحلة البناء**: GPU مجاني (مباشر عبر Hugging Face Spaces)
96
+ - **وقت المتوقع**: 5-10 دقائق (GPU) أو 15-20 دقيقة (CPU)
97
+ - **مخرجات النموذج**: محفوظ عند ./results
98
+ - **الاستخدام**: النصوص العربية فقط
99
  """)
100
 
101
  if __name__ == "__main__":