Spaces:
Build error
Build error
| from dotenv import load_dotenv | |
| import os | |
| import pandas as pd | |
| import json | |
| from google.cloud import vision | |
| import google.generativeai as genai | |
| from google.oauth2 import service_account | |
| import re | |
| from pathlib import Path | |
| # Initialized Modules | |
| from modules.mapping import mapping_employee, mapping_merchant, mapping_product, mapping_unit | |
| from modules.formatting import format_date | |
| # # Load the .env from the parent directory of this file | |
| # env_path = Path(__file__).resolve().parent.parent / ".env" | |
| # load_dotenv(dotenv_path=env_path) | |
# Populate the process environment from a .env file, if one is found.
load_dotenv()

# Load the credential for Cloud-Vision-API model.
# The service-account key is supplied as a JSON *string* in the environment,
# not as a path to a key file.
service_account_info_str = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
if not service_account_info_str:
    # json.loads(None) would otherwise abort the import with an opaque TypeError.
    raise RuntimeError(
        "GOOGLE_APPLICATION_CREDENTIALS_JSON is not set; it must contain the "
        "service-account key as a JSON string."
    )
service_account_info = json.loads(service_account_info_str)
CREDENTIALS = service_account.Credentials.from_service_account_info(service_account_info)

# Load the Gemini model settings and configure the client library.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME")
genai.configure(api_key=GEMINI_API_KEY)

# Gemini model shared by every call to image_process().
LLM_model = genai.GenerativeModel(MODEL_NAME)
# Line Split Function
def line(width=30, char="="):
    """Print a horizontal separator line to stdout.

    Args:
        width: How many times ``char`` is repeated. Defaults to 30.
        char: String repeated to build the separator. Defaults to "=".
    """
    print(char * width)
# Image to raw text
def process_ocr(image_path):
    """Run Cloud Vision document text detection on an image file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The detected text, or "" when nothing was detected or OCR failed.
        Failures are printed and swallowed so the caller always gets a string.
    """
    try:
        client = vision.ImageAnnotatorClient(credentials=CREDENTIALS)
        with open(image_path, "rb") as image_file:
            content = image_file.read()
        image = vision.Image(content=content)
        response = client.document_text_detection(image=image)

        # The Vision API reports per-image failures inside the response rather
        # than raising, so surface them instead of returning "" silently.
        if response.error.message:
            print(f"OCR failed: {response.error.message}")
            return ""

        # Per the Vision API docs, the first annotation carries the whole
        # detected text; the list is empty when no text was found.
        texts = response.text_annotations
        return texts[0].description if texts else ""
    except Exception as e:
        # Best effort: unreadable files and API/transport errors yield "".
        print(f"OCR failed: {e}")
        return ""
# Parsing image-text
# Field names the prompt asks the model to return for every order.
_ORDER_FIELDS = frozenset(
    {"seller", "buyer", "product_name", "unit", "quantity", "order_date"}
)


def _collect_orders(payload):
    """Return the order dicts contained in a decoded LLM JSON object."""
    if _ORDER_FIELDS & payload.keys():
        # The model answered with one flat order instead of {"order_1": {...}}.
        return [payload]
    orders = []
    for value in payload.values():
        if isinstance(value, dict):
            orders.append(value)
        elif isinstance(value, list):
            # e.g. {"orders": [{...}, {...}]}
            orders.extend(item for item in value if isinstance(item, dict))
    return orders


def parse_image_text(text, extract_model):
    """Extract structured orders from OCR'd invoice text with an LLM.

    Args:
        text: Raw invoice text (Vietnamese) produced by OCR.
        extract_model: Object exposing ``generate_content(prompt)`` whose
            result has a ``.text`` attribute (the Gemini model in this file).

    Returns:
        A list of order dicts. Every ``order_date`` value present is passed
        through ``format_date``. Returns ``[]`` when ``text`` is empty or the
        model reply cannot be parsed; parse failures are printed, not raised.

    Raises:
        Whatever ``extract_model.generate_content`` raises; the request itself
        is intentionally not guarded, as before.
    """
    # With no invoice text there is nothing to extract; skip the model call
    # rather than invite an invented answer.
    if not text or not text.strip():
        return []

    # NOTE(review): the prompt labels "đại lý mua" (buying agent) as `seller`
    # and "đại lý bán" (selling agent) as `buyer`, which looks swapped. Left
    # unchanged because downstream mapping may rely on it; please confirm.
    prompt = f"""
    Dưới đây là nội dung hóa đơn bằng tiếng Việt. Hãy trích xuất tên đại lý mua (seller), tên đại lý bán (buyer), tên sản phẩm (product_name), đơn vị tính (unit), số lượng theo từng đơn hàng (quantity), ngày đặt hàng (order_date).
    Văn bản:
    {text}
    Trả về kết quả dạng JSON:
    {{
        "order_1": {{
            "seller": "...",
            "buyer": "...",
            "product_name": "...",
            "unit": "...",
            "quantity": "...",
            "order_date": "..."
        }},
        ...
    }}
    """
    response = extract_model.generate_content(prompt)
    try:
        content = response.text
        # The reply may wrap the JSON in prose or a markdown fence, so take
        # the span from the first "{" to the last "}".
        match = re.search(r"\{[\s\S]*\}", content)
        if match is None:
            raise ValueError("No valid JSON found in Gemini output")
        orders = _collect_orders(json.loads(match.group(0)))
        # Format the date string
        for order in orders:
            if "order_date" in order:
                order["order_date"] = format_date(date_str=order["order_date"])
        return orders  # List of orders
    except Exception as e:
        print("Failed to parse JSON from LLM response:", e)
        return []
# Image Handling Function
def image_process(image_path, order_id):
    """Turn one invoice image into mapped order records.

    Pipeline: OCR the image, extract orders with the Gemini model, then map
    merchant, unit and product names using the JSON files named by the
    MERCHANT_JSON_PATH, UNIT_JSON_PATH, PRODUCT_JSON_PATH and
    NORMALIZATION_RULE_PATH environment variables. Progress is printed.

    Args:
        image_path: Path of the invoice image.
        order_id: Identifier stored under "order_id" on every returned record.

    Returns:
        The mapped records, or ``[]`` when OCR produced no text.
    """
    print(f"Start process image file: {os.path.basename(image_path)}")
    line()

    # Image to Text
    raw_text = process_ocr(image_path=image_path)
    if not raw_text or not raw_text.strip():
        # process_ocr returns "" on failure; do not report success or spend an
        # LLM call and three mapping passes on empty input.
        print("No text extracted from image; skipping extraction and mapping.")
        line()
        return []
    print(f"Successfully extract raw text. Text: {raw_text}")
    line()

    # Text to JSON
    extracted_information = parse_image_text(
        text=raw_text,
        extract_model=LLM_model,
    )
    print("Extracted Information.")
    line()

    # Mapping: each stage consumes the previous stage's output.
    normalization_rule = os.getenv("NORMALIZATION_RULE_PATH")
    merchant_mapped_data = mapping_merchant(
        information=extracted_information,
        json_path=os.getenv("MERCHANT_JSON_PATH"),
        normalization_rule=normalization_rule,
    )
    unit_merchant_mapped_data = mapping_unit(
        information=merchant_mapped_data,
        json_path=os.getenv("UNIT_JSON_PATH"),
        normalization_rule=normalization_rule,
    )
    product_unit_merchant_mapped_data = mapping_product(
        information=unit_merchant_mapped_data,
        json_path=os.getenv("PRODUCT_JSON_PATH"),
        normalization_rule=normalization_rule,
    )

    # Skipping employee
    processed_data = product_unit_merchant_mapped_data

    # Assign order id
    for item in processed_data:
        item["order_id"] = order_id
    # Product mapping runs too, so the message names all three stages.
    print("Successfully mapped data (merchant + unit + product).")
    line()
    return processed_data