Spaces:

ivanhoang
/

email-order-extractor

Build error

App Files Community

ivanhoang committed on Sep 19, 2025

Commit

5564d41

verified ·

1 Parent(s): 820e42a

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -20

app.py CHANGED Viewed

@@ -4,34 +4,39 @@ from datetime import datetime
 import json
 import io
 from PIL import Image
-import pytesseract # THAY ĐỔI 1: Thư viện OCR mới
-from ctransformers import AutoModelForCausalLM as CAutoModelForCausalLM
-# --- CẤU HÌNH VÀ TẢI MÔ HÌNH (PHIÊN BẢN SIÊU NHẸ) ---
 print("Ứng dụng đang khởi động...")
-# THAY ĐỔI 2: XÓA HOÀN TOÀN KHỐI TẢI MÔ HÌNH OCR NẶNG NỀ
 print("Sử dụng Tesseract OCR (siêu nhẹ).")
-# DÙNG CTRANSFORMERS ĐỂ TẢI GGUF (Giữ nguyên)
 print("Đang tải mô hình LLM (Llama-3-8B GGUF for CPU)...")
-llm_model_id = "bartowski/Meta-Llama-3-8B-Instruct-GGUF"
-llm_model_file = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
-llm = CAutoModelForCausalLM.from_pretrained(
-    llm_model_id,
-    model_file=llm_model_file,
-    model_type="llama",
-    gpu_layers=0,
-    context_length=4096
 )
-print("Tải xong mô hình LLM.")
 # --- CÁC HÀM XỬ LÝ ---
 def run_ocr(image: Image.Image) -> str:
-    # THAY ĐỔI 3: VIẾT LẠI HOÀN TOÀN HÀM OCR
-    """Hàm chạy Tesseract OCR để đọc chữ từ ảnh"""
     try:
         text = pytesseract.image_to_string(image)
         return text
@@ -40,13 +45,23 @@ def run_ocr(image: Image.Image) -> str:
         return "Lỗi khi đọc chữ từ ảnh."
 def extract_order_from_text(text: str) -> dict:
-    # (Giữ nguyên không thay đổi)
     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
     You are an expert assistant that only outputs valid JSON. Extract order information from the text. The JSON object must contain "ten_khach_hang" (string, null if not found) and "danh_sach_hang" (an array of items). Each item must have "ten_hang" (string), "so_luong" (number), "don_vi" (string), "ma_hang" (string, null if not found), and "ghi_chu" (string, null if not found).<|eot_id|><|start_header_id|>user<|end_header_id|>
     Text Content:
     {text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
     """
-    response_text = llm(prompt, max_new_tokens=1024, temperature=0.1, stop=["<|eot_id|>"])
     try:
         json_str = response_text.strip()
         start = json_str.find('{')
@@ -58,7 +73,7 @@ def extract_order_from_text(text: str) -> dict:
         return {"error": "AI trả về định dạng không hợp lệ", "raw_response": response_text}
 def create_excel_file(order_data: dict):
-    # (Giữ nguyên không thay đổi)
     if not order_data or "danh_sach_hang" not in order_data or not order_data["danh_sach_hang"]: return None
     flat_data = []
     customer = order_data.get('ten_khach_hang', 'N/A')
@@ -76,7 +91,7 @@ def create_excel_file(order_data: dict):
     return (filename, output.getvalue())
 def process_image_and_extract(image):
-    # (Giữ nguyên không thay đổi)
     try:
         if image is None: return "Vui lòng dán ảnh vào.", None, None
         extracted_text = run_ocr(image)

 import json
 import io
 from PIL import Image
+import pytesseract
+from huggingface_hub import hf_hub_download # Thư viện mới
+from llama_cpp import Llama # Thư viện mới
+# --- CẤU HÌNH VÀ TẢI MÔ HÌNH (SỬ DỤNG LLAMA.CPP) ---
 print("Ứng dụng đang khởi động...")
+# OCR không thay đổi
 print("Sử dụng Tesseract OCR (siêu nhẹ).")
+# THAY ĐỔI LỚN: DÙNG LLAMA-CPP-PYTHON
 print("Đang tải mô hình LLM (Llama-3-8B GGUF for CPU)...")
+model_repo = "bartowski/Meta-Llama-3-8B-Instruct-GGUF"
+model_file = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
+# Tải file mô hình về cache của Space
+model_path = hf_hub_download(repo_id=model_repo, filename=model_file)
+print(f"Đã tải xong file mô hình tại: {model_path}")
+# Khởi tạo mô hình từ file đã tải
+llm = Llama(
+  model_path=model_path,
+  n_ctx=4096,     # Context length
+  n_gpu_layers=0, # Chạy hoàn toàn trên CPU
+  verbose=True,   # In ra thông tin để debug
 )
+print("Tải xong và khởi tạo thành công mô hình LLM.")
 # --- CÁC HÀM XỬ LÝ ---
 def run_ocr(image: Image.Image) -> str:
+    # (Giữ nguyên)
     try:
         text = pytesseract.image_to_string(image)
         return text
         return "Lỗi khi đọc chữ từ ảnh."
 def extract_order_from_text(text: str) -> dict:
+    # Cập nhật prompt và cách gọi cho Llama.cpp
     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
     You are an expert assistant that only outputs valid JSON. Extract order information from the text. The JSON object must contain "ten_khach_hang" (string, null if not found) and "danh_sach_hang" (an array of items). Each item must have "ten_hang" (string), "so_luong" (number), "don_vi" (string), "ma_hang" (string, null if not found), and "ghi_chu" (string, null if not found).<|eot_id|><|start_header_id|>user<|end_header_id|>
     Text Content:
     {text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
     """
+    output = llm(
+        prompt,
+        max_tokens=1024,
+        stop=["<|eot_id|>"],
+        temperature=0.1,
+        echo=False # Không in lại prompt trong kết quả
+    )
+    response_text = output['choices'][0]['text']
     try:
         json_str = response_text.strip()
         start = json_str.find('{')
         return {"error": "AI trả về định dạng không hợp lệ", "raw_response": response_text}
 def create_excel_file(order_data: dict):
+    # (Giữ nguyên)
     if not order_data or "danh_sach_hang" not in order_data or not order_data["danh_sach_hang"]: return None
     flat_data = []
     customer = order_data.get('ten_khach_hang', 'N/A')
     return (filename, output.getvalue())
 def process_image_and_extract(image):
+    # (Giữ nguyên)
     try:
         if image is None: return "Vui lòng dán ảnh vào.", None, None
         extracted_text = run_ocr(image)