binder-sa
/

OCR-pipeline-python

Model card Files Files and versions

xet

Community

abdullah-1111 committed on Aug 7, 2025

Commit

358a08a

verified ·

1 Parent(s): 7c5647d

Update json-CR3.py

Browse files

Files changed (1) hide show

json-CR3.py +169 -165

json-CR3.py CHANGED Viewed

@@ -1,165 +1,169 @@
-import base64
-import json
-import re
-import requests
-import os
-import time
-API_KEY = "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
-cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR2"
-output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR2\cr2_json"
-os.makedirs(output_json_folder, exist_ok=True)
-prompt = """
-Extract the following fields from the CR3 document image. Return both Arabic and English text where available:
-الرقم
-التاريخ
-الرقم الموحد للمنشأة
-اسم التاجر
-الجنسية
-تاريخ الميلاد
-رقم السجل المدني-الإقامة
-تاريخه
-مصدرة
-مركزها الرئيسي
-هاتف
-الرمز البريدي
-رقم سجل المركز الرئيسي
-الاسم التجاري للفرع
-العنوان
-الرمز البريدي
-الهاتف
-النشاط
-رأس المال
-إسم المدير او الوكيل المفوض
-الجنسية
-تاريخ الميلاد
-رقم السجل المدني-الإقامة
-تاريخه
-مصدره
-سلطات المدير
-يشهد مكتب السجل التجاري بمدينة
-بأنه تم تسجيل المؤسسة المذكورة أعلاة بمدينة
-وتنتهي صلاحية الشهادات في
-بموجب الإيصال رقم
-وتاريخ
-مدير السجل التجاري
-Return as JSON with keys:
-{
-"الرقم": ...,
-"التاريخ": ...,
-"الرقم_الموحد_للمنشأة": ...,
-"اسم_التاجر": ...,
-"الجنسية": ...,
-"تاريخ_الميلاد": ...,
-"رقم_السجل_المدني_الإقامة": ...,
-"تاريخه": ...,
-"مصدرة": ...,
-"مركزها_الرئيسي": ...,
-"هاتف": ...,
-"الرمز_البريدي": ...,
-"رقم_سجل_المركز_الرئيسي": ...,
-"الاسم_التجاري_للفرع": ...,
-"العنوان": ...,
-"الرمز_البريدي_الفرع": ...,
-"الهاتف_الفرع": ...,
-"النشاط": ...,
-"رأس_المال": ...,
-"اسم_المدير_او_الوكيل_المفوض": ...,
-"الجنسية_المدير": ...,
-"تاريخ_ميلاد_المدير": ...,
-"رقم_السجل_المدني_الإقامة_المدير": ...,
-"تاريخه_المدير": ...,
-"مصدره_المدير": ...,
-"سلطات_المدير": ...,
-"يشهد_مكتب_السجل": ...,
-"تم_تسجيل_المؤسسة": ...,
-"تنتهي_صلاحية_الشهادة": ...,
-"الإيصال_رقم": ...,
-"الإيصال_تاريخ": ...,
-"مدير_السجل_التجاري": ...
-}
-If a field is missing, set it to null.
-"""
-url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
-headers = {"Content-Type": "application/json"}
-def split_address(address):
-    parts = [p.strip() for p in address.split("،")]
-    while len(parts) < 4:
-        parts.append(None)
-    return parts
-for image_name in os.listdir(cr1_images_folder):
-    if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
-        continue
-    image_path = os.path.join(cr1_images_folder, image_name)
-    base_name = os.path.splitext(image_name)[0]
-    output_file = os.path.join(output_json_folder, base_name + ".json")
-    if os.path.exists(output_file):
-        print(f"Skipped {image_name} (JSON file already exists)")
-        continue
-    with open(image_path, "rb") as f:
-        image_b64 = base64.b64encode(f.read()).decode()
-    data = {
-        "contents": [
-            {
-                "role": "user",
-                "parts": [
-                    {"text": prompt},
-                    {
-                        "inline_data": {
-                            "mime_type": "image/jpeg",
-                            "data": image_b64
-                        }
-                    }
-                ]
-            }
-        ]
-    }
-    try:
-        response = requests.post(url, headers=headers, json=data)
-        response.raise_for_status()
-        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
-        # استخدام التعبير المنتظم الصحيح
-        match = re.search(r"```json\s*(\{.*?\})\s*```", response_text, re.DOTALL)
-        if match:
-            json_text = match.group(1)
-            result = json.loads(json_text)
-            # تقسيم العنوان إلى أجزاء منفصلة
-            if "مركزها الرئيسي" in result and result["مركزها الرئيسي"]:
-                parts = split_address(result["مركزها الرئيسي"])
-                result["رقم المبنى"] = parts[0]
-                result["اسم الشارع"] = parts[1]
-                result["اسم الحي"] = parts[2]
-                result["الرقم الإضافي"] = parts[3]
-            else:
-                result["رقم المبنى"] = None
-                result["اسم الشارع"] = None
-                result["اسم الحي"] = None
-                result["الرقم الإضافي"] = None
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(result, f, ensure_ascii=False, indent=2)
-            print(f"✅ Processed: {image_name}")
-        else:
-            print(f"❌ Failed to extract JSON from: {image_name}")
-            print(response_text)
-        time.sleep(3)  # ينتظر 3 ثواني قبل إرسال الصورة التالية
-    except Exception as e:
-        print(f"❌ Error processing image {image_name}: {e}")

+import base64
+import json
+import re
+import requests
+import os
+import time
# --- Script configuration -------------------------------------------------
# SECURITY: the Gemini API key was previously hard-coded on this line and
# committed to the repository. It is now read from the environment so the
# secret never lives in source control. The previously committed key must be
# treated as leaked and rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    print("⚠️ GEMINI_API_KEY is not set; every API request will be rejected.")

# Folder scanned for document images. The default is the original author's
# local path; set CR_IMAGES_FOLDER to run the script on another machine.
# NOTE(review): the variable is named cr1_*, the default path points at a CR2
# folder and the prompt below talks about CR3 documents - confirm which
# document class this script is meant to process.
cr1_images_folder = os.environ.get(
    "CR_IMAGES_FOLDER",
    r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR2",
)
# Folder receiving one JSON file per processed image. Defaults to the
# "cr2_json" sub-folder of the images folder (same location as before when
# the images folder is left at its default on Windows).
output_json_folder = os.environ.get("CR_OUTPUT_JSON_FOLDER") or os.path.join(
    cr1_images_folder, "cr2_json"
)
os.makedirs(output_json_folder, exist_ok=True)

# Instruction sent to the model together with each image. It lists the Arabic
# field labels to look for and the English JSON keys to answer with.
prompt = """
Extract the following fields from the CR3 document image. Return both Arabic text and English text where available:
Fields to extract (in Arabic):
- الرقم
- التاريخ
- الرقم الموحد للمنشأة
- اسم التاجر
- الجنسية
- تاريخ الميلاد
- رقم السجل المدني-الإقامة
- تاريخه
- مصدرة
- مركزها الرئيسي
- هاتف
- الرمز البريدي
- رقم سجل المركز الرئيسي
- الاسم التجاري للفرع
- العنوان
- الرمز البريدي
- الهاتف
- النشاط
- رأس المال
- إسم المدير او الوكيل المفوض
- الجنسية
- تاريخ الميلاد
- رقم السجل المدني-الإقامة
- تاريخه
- مصدره
- سلطات المدير
- يشهد مكتب السجل التجاري بمدينة
- بأنه تم تسجيل المؤسسة المذكورة أعلاة بمدينة
- وتنتهي صلاحية الشهادات في
- بموجب الإيصال رقم
- وتاريخ
- مدير السجل التجاري
Return the result as JSON with the following keys in English:
{
  "document_number": null,
  "document_date": null,
  "unified_establishment_number": null,
  "merchant_name": null,
  "nationality": null,
  "birth_date": null,
  "national_id": null,
  "national_id_issue_date": null,
  "national_id_issue_place": null,
  "main_office": null,
  "phone": null,
  "postal_code": null,
  "main_office_registry_number": null,
  "branch_commercial_name": null,
  "branch_address": null,
  "branch_postal_code": null,
  "branch_phone": null,
  "business_activity": null,
  "capital": null,
  "manager_name": null,
  "manager_nationality": null,
  "manager_birth_date": null,
  "manager_national_id": null,
  "manager_id_issue_date": null,
  "manager_id_issue_place": null,
  "manager_authority": null,
  "registry_office_city": null,
  "registered_in_city": null,
  "certificate_expiry_date": null,
  "receipt_number": null,
  "receipt_date": null,
  "registry_manager": null
}
If a field is missing, set its value to null.
"""
# generateContent endpoint; the key travels as a query parameter, so avoid
# logging this URL verbatim.
url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
headers = {"Content-Type": "application/json"}
def split_address(address):
    """Split a comma-separated address into at least four stripped parts.

    The address is split on the Arabic comma ("،") as well as the Latin
    comma (","), each piece is stripped of surrounding whitespace, and the
    list is padded with ``None`` until it holds four entries. Addresses with
    more than four pieces keep all of them.

    Args:
        address: Address text. ``None`` or any non-string value is treated
            as "no address" instead of raising.

    Returns:
        A list of at least four items (strings and/or ``None``).
    """
    if not isinstance(address, str):
        return [None, None, None, None]
    parts = [piece.strip() for piece in re.split(r"[،,]", address)]
    # A non-positive multiplier yields an empty list, so longer addresses
    # are left untouched.
    parts.extend([None] * (4 - len(parts)))
    return parts
# Main batch loop: send every supported image in the input folder to the
# model, parse the JSON it returns, add the split address components and
# write one JSON file per image. Images that already have an output file
# are skipped, so the script can be re-run after a partial failure.

# Output keys under which the four address components are stored.
ADDRESS_PART_KEYS = ("رقم المبنى", "اسم الشارع", "اسم الحي", "الرقم الإضافي")
# Keys that may hold the main-office address: the English key requested by
# the current prompt first, then the Arabic spellings as a fallback.
MAIN_OFFICE_KEYS = ("main_office", "مركزها_الرئيسي", "مركزها الرئيسي")
# Matches a JSON object wrapped in a ```json fenced block.
FENCED_JSON_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)

for image_name in os.listdir(cr1_images_folder):
    lowered_name = image_name.lower()
    if not lowered_name.endswith(('.jpg', '.jpeg', '.png')):
        continue
    image_path = os.path.join(cr1_images_folder, image_name)
    base_name = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_json_folder, base_name + ".json")
    if os.path.exists(output_file):
        print(f"Skipped {image_name} (JSON file already exists)")
        continue
    # Declare the real image type instead of always claiming JPEG.
    mime_type = "image/png" if lowered_name.endswith(".png") else "image/jpeg"
    try:
        # Reading happens inside the try so one unreadable file does not
        # abort the whole batch.
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode()
        data = {
            "contents": [
                {
                    "role": "user",
                    "parts": [
                        {"text": prompt},
                        {
                            "inline_data": {
                                "mime_type": mime_type,
                                "data": image_b64,
                            }
                        },
                    ],
                }
            ]
        }
        # Without a timeout a stalled connection would block the batch forever.
        response = requests.post(url, headers=headers, json=data, timeout=120)
        response.raise_for_status()
        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']

        # Prefer a ```json fenced block; otherwise fall back to the outermost
        # braces in case the model answered with bare JSON.
        match = FENCED_JSON_RE.search(response_text)
        if match:
            json_text = match.group(1)
        else:
            start = response_text.find("{")
            end = response_text.rfind("}")
            json_text = response_text[start:end + 1] if 0 <= start < end else None
        if json_text is None:
            print(f"❌ Failed to extract JSON from: {image_name}")
            print(response_text)
            continue
        result = json.loads(json_text)

        # Split the main-office address into separate components. The lookup
        # uses the key the prompt actually asks for; only a non-empty string
        # is treated as an address.
        address = None
        for key in MAIN_OFFICE_KEYS:
            candidate = result.get(key)
            if isinstance(candidate, str) and candidate.strip():
                address = candidate
                break
        parts = split_address(address) if address else [None] * 4
        # zip() stops after four entries, so extra address pieces are ignored.
        for part_key, part in zip(ADDRESS_PART_KEYS, parts):
            result[part_key] = part

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"✅ Processed: {image_name}")
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next image.
        # NOTE: HTTP error messages include the request URL, which embeds the
        # API key - avoid sharing this output verbatim.
        print(f"❌ Error processing image {image_name}: {e}")
    finally:
        # Wait 3 seconds before sending the next image, also after a failure,
        # so a rate-limit error is not followed by an immediate retry burst.
        time.sleep(3)