Spaces:

eoeooe
/

Gooo

Runtime error

App Files Community

eoeooe committed on Sep 2, 2025

Commit

6b549b4

verified ·

1 Parent(s): 1ee2fbb

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -14

app.py CHANGED Viewed

@@ -4,42 +4,46 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
-# โหลดโมเดล transformers ภาษาไทย (WangchanBERT) พร้อมตั้ง use_fast=False
-model_name = "airesearch/wangchanberta-base-att-spm-uncased"
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 def ocr_and_classify(image):
-    # OCR อ่านข้อความจากภาพ ภาษาไทย+อังกฤษ
     text = pytesseract.image_to_string(image, lang="tha+eng")
     if not text.strip():
         return "❌ ไม่พบข้อความในภาพ"
-    # Tokenize ข้อความเพื่อส่งเข้าโมเดล
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     outputs = model(**inputs)
-    # แปลง logits เป็น probabilities และหา class ที่มีคะแนนสูงสุด
     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
     pred_class_idx = torch.argmax(probs).item()
     confidence = probs[0][pred_class_idx].item()
-    # กำหนดชื่อคลาสเอง (ต้องแก้ตามโมเดลจริงที่ใช้)
-    labels = ["คลาส 0", "คลาส 1", "คลาส 2"]  # ตัวอย่าง
     result = (
         f"📄 ข้อความที่อ่านได้:\n{text}\n\n"
         f"📝 การจำแนกข้อความ:\n{labels[pred_class_idx]} (ความมั่นใจ {confidence:.2%})"
     )
     return result
 iface = gr.Interface(
     fn=ocr_and_classify,
-    inputs=gr.Image(type="pil", label="อัปโหลดภาพสลิปหรือใบเสร็จ"),
     outputs=gr.Textbox(lines=15, label="ผลลัพธ์"),
-    title="OCR + วิเคราะห์ข้อความสลิป ด้วย pytesseract + Transformers",
-    description="อ่านข้อความจากภาพด้วย pytesseract แล้วใช้โมเดล transformers วิเคราะห์ข้อความ"
 )
 if __name__ == "__main__":

 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
+# ✅ โมเดลนี้โหลดได้จริง + พร้อมใช้งาน
+model_name = "thainlp/bert-base-thai-snips"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
+# Label map จากโมเดล (SNIPS dataset มี intent เช่น สั่งอาหาร, ตรวจอากาศ)
+labels = [
+    "AddToPlaylist", "BookRestaurant", "GetWeather",
+    "PlayMusic", "RateBook", "SearchCreativeWork",
+    "SearchScreeningEvent"
+]
 def ocr_and_classify(image):
+    # OCR อ่านข้อความไทย + อังกฤษ
     text = pytesseract.image_to_string(image, lang="tha+eng")
     if not text.strip():
         return "❌ ไม่พบข้อความในภาพ"
+    # วิเคราะห์ข้อความด้วย BERT
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     outputs = model(**inputs)
     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
     pred_class_idx = torch.argmax(probs).item()
     confidence = probs[0][pred_class_idx].item()
     result = (
         f"📄 ข้อความที่อ่านได้:\n{text}\n\n"
         f"📝 การจำแนกข้อความ:\n{labels[pred_class_idx]} (ความมั่นใจ {confidence:.2%})"
     )
     return result
+# Gradio UI
 iface = gr.Interface(
     fn=ocr_and_classify,
+    inputs=gr.Image(type="pil", label="อัปโหลดภาพ"),
     outputs=gr.Textbox(lines=15, label="ผลลัพธ์"),
+    title="OCR + วิเคราะห์ข้อความด้วย BERT (Thai)",
+    description="อ่านข้อความจากภาพด้วย pytesseract แล้ววิเคราะห์ด้วย BERT ที่รองรับภาษาไทย"
 )
 if __name__ == "__main__":