Spaces:

cochi1706
/

TextClassification

Sleeping

App Files Community

cochi1706 committed on Nov 15, 2025

Commit

12c8e7c

1 Parent(s): 9cef669

Refactor text classification logic to dynamically set max_length based on model configuration and streamline tokenization process, enhancing error handling with detailed traceback.

Browse files

Files changed (1) hide show

app.py +48 -53

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from torch.utils.data import Dataset, DataLoader
 # Định nghĩa các nhãn
 LABELS = ['Thế giới', 'Văn hóa', 'Chính trị Xã hội', 'Vi tính', 'Đời sống',
@@ -25,31 +24,18 @@ except Exception as e:
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 model.to(device)
 model.eval()
-print("Model đã được tải thành công!")
-# Dataset class cho inference
-class TextDataset(Dataset):
-    def __init__(self, texts, tokenizer, max_length=512):
-        self.texts = texts
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-    def __len__(self):
-        return len(self.texts)
-    def __getitem__(self, idx):
-        text = str(self.texts[idx])
-        encoding = self.tokenizer(
-            text,
-            truncation=True,
-            padding='max_length',
-            max_length=self.max_length,
-            return_tensors='pt'
-        )
-        return {
-            'input_ids': encoding['input_ids'].flatten(),
-            'attention_mask': encoding['attention_mask'].flatten()
-        }
 def classify_text(text):
     """
@@ -59,39 +45,48 @@ def classify_text(text):
         return "Vui lòng nhập văn bản cần phân loại!"
     try:
-        # Tạo dataset và dataloader
-        dataset = TextDataset([text], tokenizer)
-        dataloader = DataLoader(dataset, batch_size=1)
         # Dự đoán
         with torch.no_grad():
-            for batch in dataloader:
-                batch = {k: v.to(device) for k, v in batch.items()}
-                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
-                pred_label_id = torch.argmax(outputs.logits, dim=1).item()
-                # Lấy xác suất cho tất cả các lớp
-                probabilities = torch.softmax(outputs.logits, dim=1)[0]
-                # Tạo kết quả
-                predicted_label = LABELS[pred_label_id]
-                confidence = probabilities[pred_label_id].item() * 100
-                # Tạo danh sách xác suất cho tất cả các nhãn
-                results = []
-                for i, label in enumerate(LABELS):
-                    prob = probabilities[i].item() * 100
-                    results.append(f"{label}: {prob:.2f}%")
-                result_text = f"**Nhãn dự đoán: {predicted_label}**\n"
-                result_text += f"**Độ tin cậy: {confidence:.2f}%**\n\n"
-                result_text += "**Xác suất cho tất cả các nhãn:**\n"
-                result_text += "\n".join(results)
-                return result_text
     except Exception as e:
-        return f"Lỗi khi phân loại: {str(e)}"
 # Tạo giao diện Gradio
 with gr.Blocks(title="Phân loại văn bản tiếng Việt", theme=gr.themes.Soft()) as demo:

 import gradio as gr
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 # Định nghĩa các nhãn
 LABELS = ['Thế giới', 'Văn hóa', 'Chính trị Xã hội', 'Vi tính', 'Đời sống',
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 model.to(device)
 model.eval()
+# Lấy max_length từ model config (nếu có) hoặc dùng giá trị mặc định
+# Dựa trên lỗi, model có vẻ được train với max_length=258
+try:
+    if hasattr(model.config, 'max_position_embeddings'):
+        max_length = min(model.config.max_position_embeddings, 258)
+    else:
+        max_length = 258  # Giá trị dựa trên lỗi
+except:
+    max_length = 258  # Giá trị mặc định dựa trên lỗi
+print(f"Model đã được tải thành công! Max length: {max_length}")
 def classify_text(text):
     """
         return "Vui lòng nhập văn bản cần phân loại!"
     try:
+        # Tokenize văn bản
+        # Model có vẻ được train với max_length=258, nên cần pad đến đúng độ dài này
+        encoding = tokenizer(
+            text,
+            truncation=True,
+            padding='max_length',
+            max_length=max_length,
+            return_tensors='pt'
+        )
+        # Chuyển sang device
+        input_ids = encoding['input_ids'].to(device)
+        attention_mask = encoding['attention_mask'].to(device)
         # Dự đoán
         with torch.no_grad():
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+            pred_label_id = torch.argmax(outputs.logits, dim=1).item()
+            # Lấy xác suất cho tất cả các lớp
+            probabilities = torch.softmax(outputs.logits, dim=1)[0]
+            # Tạo kết quả
+            predicted_label = LABELS[pred_label_id]
+            confidence = probabilities[pred_label_id].item() * 100
+            # Tạo danh sách xác suất cho tất cả các nhãn
+            results = []
+            for i, label in enumerate(LABELS):
+                prob = probabilities[i].item() * 100
+                results.append(f"{label}: {prob:.2f}%")
+            result_text = f"**Nhãn dự đoán: {predicted_label}**\n"
+            result_text += f"**Độ tin cậy: {confidence:.2f}%**\n\n"
+            result_text += "**Xác suất cho tất cả các nhãn:**\n"
+            result_text += "\n".join(results)
+            return result_text
     except Exception as e:
+        import traceback
+        return f"Lỗi khi phân loại: {str(e)}\n\nTraceback: {traceback.format_exc()}"
 # Tạo giao diện Gradio
 with gr.Blocks(title="Phân loại văn bản tiếng Việt", theme=gr.themes.Soft()) as demo: