# File size: 5,379 Bytes

# 358a08a

import base64
import json
import re
import requests
import os
import time

# Gemini API key. Set the GEMINI_API_KEY environment variable to override the
# built-in value without editing this file.
# NOTE(review): a key committed to source control should be treated as
# compromised - rotate it and drop this fallback once the environment variable
# is set wherever the script runs.
API_KEY = os.environ.get("GEMINI_API_KEY") or "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"

# Folder scanned for input images.
# NOTE(review): the variable is named "cr1" but the path points at the CR2
# folder - confirm which one is intended.
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR2"
# Folder that receives one JSON file per processed image.
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR2\cr2_json"

# Create the output folder up front; exist_ok makes re-runs harmless.
os.makedirs(output_json_folder, exist_ok=True)

# Instruction text sent as the text part of every request, next to the image.
# It lists 32 Arabic field labels to read from the document, then the JSON
# template (32 English keys, all defaulting to null) the model is asked to
# fill in. Missing fields are to be returned as null.
# NOTE(review): the text says "CR3 document image" while the configured
# folders are named CR2 - confirm which document type is intended.
prompt = """
Extract the following fields from the CR3 document image. Return both Arabic text and English text where available:

Fields to extract (in Arabic):
- الرقم
- التاريخ
- الرقم الموحد للمنشأة
- اسم التاجر
- الجنسية
- تاريخ الميلاد
- رقم السجل المدني-الإقامة
- تاريخه
- مصدرة
- مركزها الرئيسي
- هاتف
- الرمز البريدي
- رقم سجل المركز الرئيسي
- الاسم التجاري للفرع
- العنوان
- الرمز البريدي
- الهاتف
- النشاط
- رأس المال
- إسم المدير او الوكيل المفوض
- الجنسية
- تاريخ الميلاد
- رقم السجل المدني-الإقامة
- تاريخه
- مصدره
- سلطات المدير
- يشهد مكتب السجل التجاري بمدينة
- بأنه تم تسجيل المؤسسة المذكورة أعلاة بمدينة
- وتنتهي صلاحية الشهادات في
- بموجب الإيصال رقم
- وتاريخ
- مدير السجل التجاري

Return the result as JSON with the following keys in English:

{
  "document_number": null,
  "document_date": null,
  "unified_establishment_number": null,
  "merchant_name": null,
  "nationality": null,
  "birth_date": null,
  "national_id": null,
  "national_id_issue_date": null,
  "national_id_issue_place": null,
  "main_office": null,
  "phone": null,
  "postal_code": null,
  "main_office_registry_number": null,
  "branch_commercial_name": null,
  "branch_address": null,
  "branch_postal_code": null,
  "branch_phone": null,
  "business_activity": null,
  "capital": null,
  "manager_name": null,
  "manager_nationality": null,
  "manager_birth_date": null,
  "manager_national_id": null,
  "manager_id_issue_date": null,
  "manager_id_issue_place": null,
  "manager_authority": null,
  "registry_office_city": null,
  "registered_in_city": null,
  "certificate_expiry_date": null,
  "receipt_number": null,
  "receipt_date": null,
  "registry_manager": null
}

If a field is missing, set its value to null.
"""

# Gemini generateContent endpoint. The API key travels in the
# `x-goog-api-key` header instead of the query string: HTTP errors raised by
# `requests` include the request URL in their message, and this script prints
# those messages, so a key embedded in the URL would end up on the console.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {"Content-Type": "application/json", "x-goog-api-key": API_KEY}

def split_address(address, min_parts=4):
    """Split an address on the Arabic comma into whitespace-stripped parts.

    Args:
        address: Address text whose components are separated by the Arabic
            comma "،" (U+060C). Any non-string value (for example ``None``)
            is treated as "no address".
        min_parts: Minimum length of the returned list; shorter results are
            padded with ``None``. Defaults to 4.

    Returns:
        A list of at least ``min_parts`` items. Components beyond
        ``min_parts`` are kept, not truncated.
    """
    # Guard against missing or non-text values instead of raising
    # AttributeError on `.split`.
    if not isinstance(address, str):
        return [None] * min_parts

    parts = [part.strip() for part in address.split("،")]
    # A non-positive multiplier yields an empty list, so long addresses are
    # left untouched.
    parts.extend([None] * (min_parts - len(parts)))
    return parts

# Keys added to every result for the components of the main-office address
# (building number, street name, district name, additional number).
_ADDRESS_PART_KEYS = ("رقم المبنى", "اسم الشارع", "اسم الحي", "الرقم الإضافي")

# Seconds to wait for the API before abandoning a request; without a timeout
# a stalled connection would block the whole batch indefinitely.
_REQUEST_TIMEOUT_SECONDS = 120

# Matches a JSON object wrapped in a ```json Markdown fence.
_FENCED_JSON_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)


def _parse_model_json(response_text):
    """Return the JSON object contained in the model reply, or None.

    A ```json fenced block is preferred. When the reply has no such fence,
    the text between the first "{" and the last "}" is tried instead.

    Raises:
        json.JSONDecodeError: If a fenced block is present but is not valid
            JSON.
    """
    fenced = _FENCED_JSON_RE.search(response_text)
    if fenced:
        return json.loads(fenced.group(1))

    start = response_text.find("{")
    end = response_text.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(response_text[start:end + 1])
    except json.JSONDecodeError:
        return None


# Process every supported image in the input folder, writing one JSON file per
# image. Images whose JSON file already exists are skipped, so the script can
# be re-run to resume an interrupted batch.
for image_name in os.listdir(cr1_images_folder):
    lowered_name = image_name.lower()
    if not lowered_name.endswith(('.jpg', '.jpeg', '.png')):
        continue

    image_path = os.path.join(cr1_images_folder, image_name)
    base_name = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_json_folder, base_name + ".json")

    if os.path.exists(output_file):
        print(f"Skipped {image_name} (JSON file already exists)")
        continue

    # Declare the actual image type instead of labelling PNG files as JPEG.
    mime_type = "image/png" if lowered_name.endswith(".png") else "image/jpeg"

    try:
        # Reading happens inside the try so one unreadable file does not
        # abort the rest of the batch.
        with open(image_path, "rb") as image_file:
            image_b64 = base64.b64encode(image_file.read()).decode()

        data = {
            "contents": [
                {
                    "role": "user",
                    "parts": [
                        {"text": prompt},
                        {
                            "inline_data": {
                                "mime_type": mime_type,
                                "data": image_b64
                            }
                        }
                    ]
                }
            ]
        }

        response = requests.post(
            url, headers=headers, json=data, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']

        result = _parse_model_json(response_text)
        if result is None:
            print(f"❌ Failed to extract JSON from: {image_name}")
            print(response_text)
        else:
            # Split the main-office address into separate fields. The prompt
            # asks for the English key "main_office"; the Arabic label is
            # still accepted in case the model echoes it instead.
            address = result.get("main_office") or result.get("مركزها الرئيسي")
            if isinstance(address, str) and address:
                address_parts = split_address(address)
            else:
                address_parts = [None] * len(_ADDRESS_PART_KEYS)
            # zip stops at the four known keys, ignoring any extra components.
            for key, value in zip(_ADDRESS_PART_KEYS, address_parts):
                result[key] = value

            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            print(f"✅ Processed: {image_name}")

    except Exception as e:
        # Per-image boundary: report the failure and move on to the next file.
        print(f"❌ Error processing image {image_name}: {e}")
    finally:
        # Wait 3 seconds before sending the next image. Doing this in
        # `finally` keeps the pacing after failures too, so an error (for
        # example a rate-limit response) is not followed by an immediate
        # retry burst.
        time.sleep(3)