Spaces:

Mahmoud-Dev
/

DistilBERT-Sentiment-Training

Sleeping

App Files Community

Mahmoud-Dev committed 17 days ago

Commit

1e900c3

verified ·

1 Parent(s): 4b0008c

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -33

app.py CHANGED Viewed

@@ -1,29 +1,36 @@
 import gradio as gr
 import torch
 from datasets import load_dataset
-from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
 import numpy as np
-# Load the sentiment dataset
-dataset = load_dataset('k1tub/sentiment_dataset')
-print(f"Dataset loaded with {len(dataset['train'])} training examples")
-# Load tokenizer and model
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
 def preprocess_function(examples):
-    # Tokenize the text
     encoding = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
     # Map label to indices
-    encoding['labels'] = examples['label']
     return encoding
 # Preprocess the dataset
 tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text'])
 def train_model(epochs, batch_size, learning_rate):
-    """Fine-tune DistilBERT on the sentiment dataset"""
     try:
         training_args = TrainingArguments(
             output_dir='./results',
@@ -40,43 +47,43 @@ def train_model(epochs, batch_size, learning_rate):
             model=model,
             args=training_args,
             train_dataset=tokenized_dataset['train'],
-            eval_dataset=tokenized_dataset['validation'] if 'validation' in tokenized_dataset else tokenized_dataset['train'],
         )
         # Start training
         trainer.train()
-        return "\u270d✅ Training completed successfully!\n" + \
-                f"Model saved to ./results\nFinal learning rate: {learning_rate}\nEpochs: {epochs}\nBatch size: {batch_size}"
     except Exception as e:
-        return f"❌ Error during training: {str(e)}"
 # Create Gradio interface
-with gr.Blocks(title="DistilBERT Sentiment Training") as demo:
     gr.Markdown("""
-    # 🚀 DistilBERT Sentiment Analysis Training
-    Fine-tune **DistilBERT** model on the **k1tub/sentiment_dataset** (290k examples)
-    ### Model Info:
-    - **Base Model**: distilbert-base-uncased (67M parameters)
-    - **Task**: Text Classification (Sentiment Analysis)
-    - **Dataset**: k1tub/sentiment_dataset
-    - **Framework**: Hugging Face Transformers
-    """)
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Training Configuration")
-            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
             batch_size = gr.Slider(minimum=8, maximum=64, value=32, step=8, label="Batch Size")
             learning_rate = gr.Slider(minimum=1e-5, maximum=1e-3, value=2e-5, step=1e-5, label="Learning Rate")
         with gr.Column():
-            gr.Markdown("### Training Status")
-            output_text = gr.Textbox(label="Output", lines=10, interactive=False)
-    train_button = gr.Button("🔥 Start Training", variant="primary", scale=2)
     train_button.click(
         fn=train_model,
         inputs=[epochs, batch_size, learning_rate],
@@ -84,11 +91,11 @@ with gr.Blocks(title="DistilBERT Sentiment Training") as demo:
     )
     gr.Markdown("""
-    ### Training Details:
-    - **Free Hardware**: CPU Basic on Hugging Face Spaces
-    - **Training Time**: Depends on dataset size and hardware
-    - **Model Output**: Saved to ./results folder
-    - **Inference**: Can be deployed as a separate Space
     """)
 if __name__ == "__main__":

 import gradio as gr
 import torch
 from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
 import numpy as np
+# Load the Arabic sentiment dataset (Saudi dialect from Twitter)
+try:
+    dataset = load_dataset('arbml/Arabic_Sentiment_Twitter_Corpus')
+    print(f"Dataset loaded with {len(dataset['train'])} training examples")
+except:
+    print("Loading alternative Arabic dataset...")
+    dataset = load_dataset('asas-ai/Arabic_Sentiment_Twitter_Corpus')
+# Load tokenizer and model (supports Arabic)
+tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')
+model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=3)
 def preprocess_function(examples):
+    # Tokenize the Arabic text
     encoding = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
     # Map label to indices
+    if 'label' in examples:
+        encoding['labels'] = examples['label']
+    elif 'sentiment' in examples:
+        encoding['labels'] = examples['sentiment']
     return encoding
 # Preprocess the dataset
 tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text'])
 def train_model(epochs, batch_size, learning_rate):
+    """Fine-tune DistilBERT on Arabic sentiment dataset (Saudi dialect)"""
     try:
         training_args = TrainingArguments(
             output_dir='./results',
             model=model,
             args=training_args,
             train_dataset=tokenized_dataset['train'],
+            eval_dataset=tokenized_dataset.get('validation', tokenized_dataset['train']),
         )
         # Start training
         trainer.train()
+        return "\u270d✅ \u062aم التدريب بنجاح!\n" + \
+                f"النموذج محفوظ في ./results\nمعدل التعلم: {learning_rate}\nعدد الحقب: {epochs}\nBatch Size: {batch_size}"
     except Exception as e:
+        return f"❌ خطأ أثناء التدريب: {str(e)}"
 # Create Gradio interface
+with gr.Blocks(title="DistilBERT Arabic Sentiment Training") as demo:
     gr.Markdown("""
+    # 🚀 تدريب نموذج DistilBERT العربي
+    ضبط نموذج **DistilBERT** على تحليل المشاعر باللغة العربية (اللهجة السعودية)
+    ### معلومات النموذج:
+    - **النموذج الأساسي**: distilbert-base-multilingual-cased (67M معامل)
+    - **المهمة**: تصنيف النصوص (المتعد اللغات)
+    - **قاعدة البيانات**: arbml/Arabic_Sentiment_Twitter_Corpus (58.8k مثال)
+    - **اللغة**: العربية (اللهجة السعودية والخليجية)
+    """)
     with gr.Row():
         with gr.Column():
+            gr.Markdown("### إعدادات التدريب")
+            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="\u0639دد الحقب (Epochs)")
             batch_size = gr.Slider(minimum=8, maximum=64, value=32, step=8, label="Batch Size")
             learning_rate = gr.Slider(minimum=1e-5, maximum=1e-3, value=2e-5, step=1e-5, label="Learning Rate")
         with gr.Column():
+            gr.Markdown("### حالة التدريب")
+            output_text = gr.Textbox(label="المخرجات", lines=10, interactive=False)
+    train_button = gr.Button("🔥 بدء التدريب", variant="primary", scale=2)
     train_button.click(
         fn=train_model,
         inputs=[epochs, batch_size, learning_rate],
     )
     gr.Markdown("""
+    ### تفاصيل التدريب:
+    - **مرحلة البناء**: GPU مجاني (مباشر عبر Hugging Face Spaces)
+    - **وقت المتوقع**: 5-10 دقائق (GPU) أو 15-20 دقيقة (CPU)
+    - **مخرجات النموذج**: محفوظ عند ./results
+    - **الاستخدام**: النصوص العربية فقط
     """)
 if __name__ == "__main__":