| | |
| | """text_classification.ipynb |
| | |
| | Automatically generated by Colab. |
| | |
| | Original file is located at |
| | https://colab.research.google.com/drive/1D25W7EYF5v1a0FoSHKAcyVhwMMIU6yg4 |
| | """ |
| |
|
# Colab/IPython shell magics: install HuggingFace transformers + datasets and
# PyTorch. NOTE: "!" lines only run inside IPython/Colab, not plain Python.
!pip install transformers datasets
!pip install torch
| |
|
| | |
| | import pandas as pd |
| | import torch |
| | from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments |
| | from sklearn.preprocessing import LabelEncoder |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.metrics import accuracy_score, classification_report |
| | import joblib |
| | import numpy as np |
| | from collections import Counter |
| |
|
| | |
# --- Data loading -----------------------------------------------------------
# Read the 'products' sheet from the uploaded Excel workbook and keep only the
# two columns needed: product name ('اسم المنتج') and accounting category
# ('التصنيف المحاسبي'). Rows with a missing value in either column are dropped.
print("Loading and preprocessing data...")
df = pd.read_excel('/content/Copy ofمنتجات مقاهي (1).xlsx', sheet_name='products')
df = df[['اسم المنتج', 'التصنيف المحاسبي']].dropna()

# Encode the Arabic category strings as integer class ids for the classifier;
# the fitted encoder is saved later so predictions can be decoded again.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['التصنيف المحاسبي'])
texts = df['اسم المنتج'].tolist()

print(f"Loaded {len(texts)} products with {len(set(labels))} unique categories.")
print(f"Categories: {list(label_encoder.classes_)}")
| |
|
| | |
# --- Class distribution -----------------------------------------------------
# Count how many samples each encoded class has; classes with a single sample
# cannot be stratified and are routed to the training set below.
# (Counter is already imported at the top of the file — the duplicate
# `from collections import Counter` was removed.)
label_counts = Counter(labels)
print("Class distribution:")  # was a pointless f-string with no placeholders
for label_id, count in sorted(label_counts.items()):
    label_name = label_encoder.inverse_transform([label_id])[0]
    print(f" {label_name}: {count} samples")
| |
|
| | |
# Per-sample boolean masks: True where the sample's class occurs exactly once
# in the dataset (cannot be stratified), and its complement.
single_sample_mask = np.fromiter(
    (label_counts[lbl] == 1 for lbl in labels), dtype=bool, count=len(labels)
)
multi_sample_mask = ~single_sample_mask

# Integer row indices corresponding to each mask.
single_indices = single_sample_mask.nonzero()[0]
multi_indices = multi_sample_mask.nonzero()[0]

print(f"\nSingle-sample classes: {single_sample_mask.sum()} samples")
print(f"Multi-sample classes: {multi_sample_mask.sum()} samples")
| |
|
# --- Train/validation split -------------------------------------------------
# Stratified split is only valid for classes with >= 2 samples, so the split
# is done on the multi-sample subset and single-sample classes are appended
# to the training set afterwards (they can never appear in validation).
if np.sum(multi_sample_mask) > 0:
    multi_texts = [texts[i] for i in multi_indices]
    multi_labels = [labels[i] for i in multi_indices]

    # 80/20 stratified split over the classes that support it.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        multi_texts, multi_labels, test_size=0.2, random_state=42, stratify=multi_labels
    )

    if np.sum(single_sample_mask) > 0:
        single_texts = [texts[i] for i in single_indices]
        single_labels = [labels[i] for i in single_indices]

        # Single-sample classes go to training only.
        train_texts.extend(single_texts)
        train_labels.extend(single_labels)

        print(f"Added {len(single_texts)} single-sample items to training set")
else:
    # Degenerate case: no class has 2+ samples, so stratification is
    # impossible — fall back to a plain random split.
    print("Warning: All or most classes have single samples. Using simple split.")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")
| |
|
| | |
# --- Model ------------------------------------------------------------------
# Pretrained Arabic BERT encoder with a fresh sequence-classification head
# sized to the number of distinct categories.
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))
| |
|
| | |
class SimpleDataset(torch.utils.data.Dataset):
    """Minimal map-style dataset: tokenizes one product name per item.

    Each item is a dict with fixed-length (max_length=128) 'input_ids' and
    'attention_mask' tensors plus a scalar long 'labels' tensor — the format
    expected by the HuggingFace Trainer.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() guards against non-string cells coming out of the dataframe.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt',
        )
        # Drop the batch dimension the tokenizer adds with return_tensors='pt'.
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
| |
|
| | |
# Wrap the train/validation splits for the Trainer.
train_dataset = SimpleDataset(train_texts, train_labels, tokenizer)
val_dataset = SimpleDataset(val_texts, val_labels, tokenizer)
| |
|
| | |
def compute_metrics(eval_pred):
    """Trainer metric hook: accuracy over the argmax of the logits.

    eval_pred is a (logits, labels) pair; returns {'accuracy': float},
    which the Trainer logs as 'eval_accuracy'.
    """
    logits, gold = eval_pred
    predicted = np.argmax(logits, axis=1)
    return {'accuracy': accuracy_score(gold, predicted)}
| |
|
| | |
# --- Training configuration -------------------------------------------------
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",                  # evaluate once per epoch
    save_strategy="epoch",                  # must match eval_strategy for load_best_model_at_end
    logging_steps=10,
    save_total_limit=2,                     # keep only the two most recent checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",  # 'accuracy' from compute_metrics, 'eval_'-prefixed
    greater_is_better=True,
    # BUGFIX: report_to=None means "use the default integrations" (W&B,
    # TensorBoard, ...); the string "none" is what actually disables them.
    report_to="none",
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
)
| |
|
| | |
# --- Trainer setup, training and persistence --------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,  # NOTE(review): deprecated in newer transformers versions — confirm
    compute_metrics=compute_metrics
)

print("Training started with evaluation...")
trainer.train()

# Persist the fine-tuned model, tokenizer and the fitted LabelEncoder so
# later cells can map predicted class ids back to Arabic category names.
trainer.save_model('./model')
tokenizer.save_pretrained('./model')
joblib.dump(label_encoder, './model/labels.pkl')

print("Training complete! Model saved to './model'")
| |
|
| | |
def predict(text):
    """Predict the accounting category for a single product name.

    Returns (classification, confidence): the Arabic category string and
    the softmax probability of the top class.

    PERF FIX: the tokenizer, model and label encoder are loaded from
    './model' once and cached on the function object — the original
    re-loaded all three artifacts from disk on EVERY call, which made the
    per-item validation loop below extremely slow.
    """
    if not hasattr(predict, '_artifacts'):
        predict._artifacts = (
            AutoTokenizer.from_pretrained('./model'),
            AutoModelForSequenceClassification.from_pretrained('./model'),
            joblib.load('./model/labels.pkl'),
        )
    tokenizer, model, label_encoder = predict._artifacts

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    classification = label_encoder.inverse_transform([predicted_id])[0]

    return classification, confidence
| |
|
def predict_batch(texts):
    """Predict accounting categories for a list of product names at once.

    Returns a list of (classification, confidence) tuples in input order.

    PERF FIX: artifacts are loaded from './model' once and cached on the
    function object instead of being re-read from disk on every call.
    """
    if not hasattr(predict_batch, '_artifacts'):
        predict_batch._artifacts = (
            AutoTokenizer.from_pretrained('./model'),
            AutoModelForSequenceClassification.from_pretrained('./model'),
            joblib.load('./model/labels.pkl'),
        )
    tokenizer, model, label_encoder = predict_batch._artifacts

    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
    confidences = torch.nn.functional.softmax(outputs.logits, dim=-1).max(dim=-1)[0].cpu().numpy()
    classifications = label_encoder.inverse_transform(predictions)

    return list(zip(classifications, confidences))
| |
|
| | |
# --- Validation evaluation --------------------------------------------------
print("\nEvaluating on validation set...")
val_predictions = []
val_confidences = []

for text in val_texts:
    pred, conf = predict(text)
    val_predictions.append(pred)
    val_confidences.append(conf)

# Convert predicted category strings back to integer ids for scoring.
val_pred_numeric = label_encoder.transform(val_predictions)
accuracy = accuracy_score(val_labels, val_pred_numeric)
print(f"Validation Accuracy: {accuracy:.4f}")

val_true_labels = label_encoder.inverse_transform(val_labels)
print("\nDetailed Classification Report:")
# BUGFIX: pass labels= explicitly. Without it, classification_report only
# includes classes present in the validation data; if any class is absent
# the number of rows no longer matches target_names and sklearn raises.
# zero_division=0 silences warnings for classes with no predicted samples.
print(classification_report(
    val_true_labels,
    val_predictions,
    labels=label_encoder.classes_,
    target_names=label_encoder.classes_,
    zero_division=0,
))
| |
|
| | |
# --- Quick sanity checks on a few hand-written product names ----------------
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "عصير برتقال طبيعي"
]

print("\n" + "="*50)
print("Testing on sample products:")
print("="*50)

# One-at-a-time predictions.
for product in test_products:
    result, confidence = predict(product)
    print(f"Product: {product}")
    print(f"Classification: {result}")
    print(f"Confidence: {confidence:.3f}")
    print("-" * 30)

# Same products through the batched path.
print("\nBatch prediction example:")
batch_results = predict_batch(test_products)
for product, (classification, confidence) in zip(test_products, batch_results):
    print(f"{product} -> {classification} ({confidence:.3f})")

print(f"\nModel training complete!")
print(f"- Single prediction: predict('product name')")
print(f"- Batch prediction: predict_batch(['product1', 'product2', ...])")
print(f"- Validation accuracy: {accuracy:.4f}")
print(f"- Model saved to: './model'")
| |
|
| | |
# --- Standalone inference section -------------------------------------------
# Re-imports are redundant in the same runtime but keep this notebook cell
# runnable on its own after a kernel restart.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib

print("Loading trained model...")

try:
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')
    print("Model loaded successfully!")
    print(f"Number of available categories: {len(label_encoder.classes_)}")

    # Show every category the encoder knows about, 1-indexed.
    print("\nAvailable categories:")
    for i, category in enumerate(label_encoder.classes_, 1):
        print(f"{i:2d}. {category}")

except Exception as e:
    # Broad catch is deliberate: any load failure (missing folder, corrupt
    # weights, missing labels.pkl) should stop the notebook here.
    print(f"Error loading model: {e}")
    print("Make sure './model' folder exists and contains required files")
    exit()
| |
|
| | |
def classify_product(product_name):
    """Classify one product name with the globally loaded model.

    Returns a result dict: on success {'product', 'classification',
    'confidence', 'success': True}; on failure 'classification' is None,
    'confidence' is 0 and an 'error' message is included.
    """
    try:
        encoded = tokenizer(
            product_name,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )

        # Inference only — no gradients needed.
        with torch.no_grad():
            logits = model(**encoded).logits

        best_id = logits.argmax().item()
        best_prob = torch.nn.functional.softmax(logits, dim=-1).max().item()
        category = label_encoder.inverse_transform([best_id])[0]

        return {
            'product': product_name,
            'classification': category,
            'confidence': best_prob,
            'success': True
        }

    except Exception as e:
        # Best-effort API: report the failure instead of raising.
        return {
            'product': product_name,
            'classification': None,
            'confidence': 0,
            'success': False,
            'error': str(e)
        }
| |
|
| | |
def classify_multiple_products(product_list):
    """Classify every product in product_list, printing progress as it goes.

    Returns the list of per-product result dicts from classify_product.
    """
    print(f"Classifying {len(product_list)} products...")

    results = []
    for index, name in enumerate(product_list, 1):
        outcome = classify_product(name)
        results.append(outcome)

        if not outcome['success']:
            print(f"{index:3d}. {name} - Error: {outcome['error']}")
        else:
            print(f"{index:3d}. {name}")
            print(f" → {outcome['classification']}")
            print(f" → Confidence: {outcome['confidence']:.3f}")
        print()

    return results
| |
|
| | |
# --- Batch smoke test + summary ---------------------------------------------
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "منظف أرضيات فلاش",
    "سكر أبيض ناعم",
    "عصير برتقال طبيعي"
]

print("\n" + "="*60)
print("Testing model on sample products")
print("="*60)

test_results = classify_multiple_products(test_products)

# Summarise only the successful predictions.
successful_predictions = [r for r in test_results if r['success']]

print("="*60)
print("Results summary:")
print(f"Successfully classified {len(successful_predictions)} products")

# BUGFIX: guard against division by zero when every prediction failed
# (the original crashed with ZeroDivisionError in that case).
if successful_predictions:
    avg_confidence = sum(r['confidence'] for r in successful_predictions) / len(successful_predictions)
    print(f"Average confidence level: {avg_confidence:.3f}")

    unique_classifications = set(r['classification'] for r in successful_predictions)
    print(f"Number of categories used: {len(unique_classifications)}")
    print("Categories:")
    for classification in sorted(unique_classifications):
        count = sum(1 for r in successful_predictions if r['classification'] == classification)
        print(f" • {classification} ({count} products)")
else:
    print("No products were classified successfully.")

print("\n" + "="*60)
print("Model ready for use!")
print("="*60)
print("Usage:")
print("result = classify_product('product name')")
print("print(f\"Classification: {result['classification']}\")")
print("print(f\"Confidence: {result['confidence']:.3f}\")")

print("\nFor multiple products:")
print("products = ['product 1', 'product 2', 'product 3']")
print("results = classify_multiple_products(products)")
| |
|
# Out-of-domain probe: a perfume product, not a café item, to see how the
# classifier behaves on unfamiliar input.
test_product = 'عطر كروم ليجند للرجال او دي تواليت من ازارو 125 مل'
result, confidence = predict(test_product)

print(f"\nTest: {test_product}")
print(f"Result: {result}")
print(f"Confidence: {confidence:.3f}")
| |
|
| | """# Saving The model""" |
| |
|
| | |
| | model.save_pretrained('/content/my_model/') |
| |
|
| | |
| | from transformers import BertForSequenceClassification |
| | model = BertForSequenceClassification.from_pretrained('/content/my_model/') |
| |
|
| | !zip -r my_model.zip /content/my_model/ |
| |
|
| | tokenizer.save_pretrained('/content/my_model') |
| | model.save_pretrained('/content/my_model') |
| | import joblib |
| | joblib.dump(label_encoder, '/content/my_model/labels.pkl') |
| |
|
| | from google.colab import files |
| | files.download('my_model.zip') |
| |
|
"""# Testing"""

# List the exported model directory to confirm all artifacts are present.
!ls /content/my_model
| |
|
| |
|
| |
|
| | from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| | import torch |
| | import joblib |
| |
|
| | |
# Load the exported artifacts back for standalone inference.
save_path = '/content/my_model'

tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSequenceClassification.from_pretrained(save_path)
label_encoder = joblib.load(f'{save_path}/labels.pkl')
| |
|
def predict(text):
    """Return (category, confidence) for one product name.

    Uses the module-level tokenizer/model/label_encoder loaded above from
    the exported '/content/my_model' directory.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    top_id = logits.argmax().item()
    top_prob = torch.nn.functional.softmax(logits, dim=-1).max().item()

    return label_encoder.inverse_transform([top_id])[0], top_prob
| |
|
| | |
# Spot-check the reloaded model: a known dairy product, a short generic
# word, and an out-of-domain word.
for test_product in (
    "نادك حليب طويل الأجل 1 لتر",
    "زبادى",
    "بترول",
):
    result, confidence = predict(test_product)
    print(f"Test Product: {test_product}")
    print(f"Predicted Category: {result}")
    print(f"Confidence: {confidence:.3f}")
| |
|
# Colab file-upload widget (e.g. to bring in a new spreadsheet to classify).
from google.colab import files
uploaded = files.upload()
| |
|
| |
|