Spaces:
Build error
Build error
| import whisper | |
| from dotenv import load_dotenv | |
| import os | |
| from rapidfuzz import process, fuzz | |
| import pandas as pd | |
| import json | |
| import google.generativeai as genai | |
| import re | |
| from pathlib import Path | |
| # Initialized Modules | |
| from modules.mapping import mapping_employee, mapping_merchant, mapping_product, mapping_unit | |
| from modules.formatting import format_date | |
| # # Load the .env from the parent directory of this file | |
| # env_path = Path(__file__).resolve().parent.parent / ".env" | |
| # load_dotenv(dotenv_path=env_path) | |
# Load environment configuration through python-dotenv before any os.getenv call.
load_dotenv()

# Transcription model: Whisper "turbo". Weights are downloaded to the directory
# named by XDG_CACHE_HOME, or to /app/.cache when that variable is not set.
transcribe_model = whisper.load_model(
    "turbo",
    download_root=os.getenv("XDG_CACHE_HOME", "/app/.cache"),
)

# Gemini settings are taken from the environment.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME")

# Register the API key, then build the Gemini model used for extraction.
genai.configure(api_key=GEMINI_API_KEY)
LLM_model = genai.GenerativeModel(MODEL_NAME)
| # Line Split Function | |
def line():
    """Print a row of 30 '=' characters to separate sections of console output."""
    width = 30
    print("=" * width)
| # Audio to raw text | |
def process_audio(audio_path, transcribe_model):
    """Transcribe an audio file and return the transcript text.

    Args:
        audio_path: Path of the audio file, passed straight to the model.
        transcribe_model: Object exposing ``transcribe(path)`` whose result
            is indexed with the ``"text"`` key.

    Returns:
        The transcript text, or an empty string when transcription (or the
        ``"text"`` lookup) raises; the error is printed, not re-raised.
    """
    try:
        result = transcribe_model.transcribe(audio_path)
        text = result["text"]
    except Exception as err:
        # Best-effort: report the failure and fall back to an empty transcript.
        print(f"Trancribe failed: {err}")
        text = ""
    return text
| # Parsing audio-text | |
def parse_audio_text(text, extract_model):
    """Extract structured order records from a Vietnamese invoice transcript.

    The transcript is embedded in a prompt asking the model for a JSON object
    of the form ``{"order_1": {...}, "order_2": {...}}``. The first JSON
    object found in the reply is decoded, each order's ``order_date`` (when
    present) is passed through ``format_date``, and the orders are returned
    as a list.

    Args:
        text: Transcript text to analyse.
        extract_model: Object exposing ``generate_content(prompt)`` whose
            result has a ``text`` attribute.

    Returns:
        A list of order dicts. An empty list is returned when the transcript
        is empty/blank, when the reply contains no JSON object, or when
        decoding/formatting fails (the failure is printed).

    Raises:
        Whatever ``extract_model.generate_content`` raises; the request
        itself is intentionally not wrapped so API failures stay visible.
    """
    # An empty transcript (process_audio returns "" on failure) has nothing
    # to extract; skip the model call instead of prompting on no content.
    if not text or not text.strip():
        return []

    # NOTE(review): the prompt labels "đại lý mua" (buying agent) as `seller`
    # and "đại lý bán" (selling agent) as `buyer`, which looks swapped. Left
    # unchanged because downstream mapping may rely on it - please confirm.
    prompt = f"""
Dưới đây là nội dung hóa đơn bằng tiếng Việt. Hãy trích xuất tên đại lý mua (seller), tên đại lý bán (buyer), tên sản phẩm (product_name), đơn vị tính (unit), số lượng theo từng đơn hàng (quantity), ngày đặt hàng (order_date).
Văn bản:
{text}
Trả về kết quả dạng JSON:
{{
"order_1": {{
"seller": "...",
"buyer": "...",
"product_name": "...",
"unit": "...",
"quantity": "...",
"order_date": "..."
}},
...
}}
"""
    response = extract_model.generate_content(prompt)
    try:
        content = response.text
        start = content.find("{")
        if start == -1:
            raise ValueError("No valid JSON found in Gemini output")
        # raw_decode parses exactly one JSON value beginning at `start` and
        # ignores whatever follows it, so trailing commentary or code fences
        # (even ones containing braces) no longer break the parse.
        extracted_json, _ = json.JSONDecoder().raw_decode(content, start)

        orders = []
        for order in extracted_json.values():
            # Skip malformed entries instead of discarding every order.
            if not isinstance(order, dict):
                continue
            if "order_date" in order:
                order["order_date"] = format_date(date_str=order["order_date"])
            orders.append(order)
        return orders  # List of orders
    except Exception as e:
        print("Failed to parse JSON from LLM response:", e)
        return []
| # Audio Handling Function | |
def audio_process(audio_path, order_id):
    """Run the full pipeline for one audio file and return its mapped orders.

    Steps: transcribe the audio with the module-level Whisper model, extract
    order records with the module-level Gemini model, run the records through
    the merchant, unit and product mappers (in that order), then stamp every
    record with ``order_id``. Progress is printed along the way.

    Args:
        audio_path: Path of the audio file to process.
        order_id: Value stored under the ``"order_id"`` key of each record.

    Returns:
        The records returned by the last mapper, each carrying ``order_id``.
    """
    print(f"Start process audio file: {os.path.basename(audio_path)}")
    line()

    # Audio -> text
    transcript = process_audio(audio_path=audio_path, transcribe_model=transcribe_model)
    print(f"Transcript is done. Transcription: {transcript}")
    line()

    # Text -> order records
    records = parse_audio_text(text=transcript, extract_model=LLM_model)
    print("Extracted Information.")
    line()

    # Mapping stages, applied in sequence; each reads its lookup file path
    # from the named environment variable. Employee mapping is skipped.
    stages = (
        (mapping_merchant, "MERCHANT_JSON_PATH"),
        (mapping_unit, "UNIT_JSON_PATH"),
        (mapping_product, "PRODUCT_JSON_PATH"),
    )
    for mapper, path_variable in stages:
        records = mapper(
            information=records,
            json_path=os.getenv(path_variable),
            normalization_rule=os.getenv("NORMALIZATION_RULE_PATH"),
        )

    # Tag every record with the caller-supplied order id.
    for record in records:
        record["order_id"] = order_id
    print("Successfully mapped data (merchant + unit).")
    line()
    return records