binder-sa
/

OCR-pipeline-python

Model card · Files and versions

xet

Community

abdullah-1111 committed on Aug 7, 2025

Commit

81fdfcf

verified ·

1 Parent(s): 954546e

Update json-CR1.py

Browse files

Files changed (1) hide show

json-CR1.py +145 -143

json-CR1.py CHANGED Viewed

@@ -1,143 +1,145 @@
-import base64
-import json
-import re
-import requests
-import os
-# مفتاح Gemini API
-API_KEY = "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
-# المجلد الذي يحتوي صور cr1
-cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
-# مجلد إخراج ملفات JSON
-output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"
-# Ensure output folder exists
-os.makedirs(output_json_folder, exist_ok=True)
-# Exact same prompt
-prompt = """
-Please extract the following fields from the CR1 commercial registration document image, all in Arabic only:
-- Unified number
-- Establishment number
-- Commercial name of the institution
-- Its main center
-- Phone
-- Postal code
-- Trader name
-- Nationality
-- Date of birth
-- Civil registry number - residence
-- Its date
-- Its source
-- Passport number
-- Its date
-- Issuer
-- Activity
-- Capital
-- Name of the manager or authorized agent
-- Nationality
-- Date of birth
-- Civil registry number - residence
-- Its date
-- Its source
-- Certified by the commercial registration office in the city
-- That the above-mentioned institution is registered in the city
-- Certificates expire on
-- Receipt number
-- Date
-Please return the result as JSON with these exact keys only, and if any field is missing, set its value to null:
-{
-  "رقم الموحد": null,
-  "رقم المنشأة": null,
-  "الاسم التجاري للمؤسسة": null,
-  "مركزها الرئيسي": null,
-  "هاتف": null,
-  "الرمز البريدي": null,
-  "اسم التاجر": null,
-  "الجنسية": null,
-  "تاريخ الميلاد": null,
-  "رقم السجل المدني-الإقامة": null,
-  "تاريخه": null,
-  "مصدره": null,
-  "رقم الحفيظة-الجواز": null,
-  "تاريخه_2": null,
-  "مصدرة": null,
-  "النشاط": null,
-  "رأس المال": null,
-  "اسم المدير أو الوكيل المفوض": null,
-  "الجنسية_2": null,
-  "تاريخ الميلاد_2": null,
-  "رقم السجل المدني-الإقامة_2": null,
-  "تاريخه_3": null,
-  "مصدره_2": null,
-  "يشهد مكتب السجل التجاري بمدينة": null,
-  "بأنه تم تسجيل المؤسسة المذكورة أعلاه بمدينة": null,
-  "تنتهي صلاحية الشهادات في": null,
-  "بموجب الإيصال رقم": null,
-  "تاريخ_الإيصال": null
-}
-"""
-url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
-headers = {"Content-Type": "application/json"}
-# Iterate over all images
-for image_name in os.listdir(cr1_images_folder):
-    if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
-        continue
-    image_path = os.path.join(cr1_images_folder, image_name)
-    base_name = os.path.splitext(image_name)[0]
-    output_file = os.path.join(output_json_folder, base_name + ".json")
-    # Skip if JSON already exists
-    if os.path.exists(output_file):
-        print(f"Skipped {image_name} (JSON file already exists)")
-        continue
-    # Read image and convert to base64
-    with open(image_path, "rb") as f:
-        image_b64 = base64.b64encode(f.read()).decode()
-    # Send request to Gemini API
-    data = {
-        "contents": [
-            {
-                "role": "user",
-                "parts": [
-                    {"text": prompt},
-                    {
-                        "inline_data": {
-                            "mime_type": "image/jpeg",
-                            "data": image_b64
-                        }
-                    }
-                ]
-            }
-        ]
-    }
-    try:
-        response = requests.post(url, headers=headers, json=data)
-        response.raise_for_status()
-        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
-        match = re.search(r"```json\s*(\{.*\})\s*```", response_text, re.DOTALL)
-        if match:
-            json_text = match.group(1)
-            result = json.loads(json_text)
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(result, f, ensure_ascii=False, indent=2)
-            print(f"✅ Processed: {image_name}")
-        else:
-            print(f"❌ Failed to extract JSON from: {image_name}")
-            print(response_text)
-    except Exception as e:
-        print(f"❌ Error processing image {image_name}: {e}")

+"""Extract CR1 commercial-registration fields from images with the Gemini API.
+
+For every .jpg/.jpeg/.png file in ``cr1_images_folder`` the script sends the
+image together with ``prompt`` to the Gemini ``generateContent`` endpoint,
+pulls the JSON object out of the text reply, and writes it to
+``<image base name>.json`` inside ``output_json_folder``. Images whose JSON
+file already exists are skipped, so the script can be re-run safely.
+
+The API key is read from the ``GEMINI_API_KEY`` environment variable.
+"""
+import base64
+import json
+import re
+import requests
+import os
+
+# Gemini API key.
+# SECURITY: a literal key was previously committed in this file; that key must
+# be treated as leaked and revoked. The key is now taken from the environment
+# so it never lands in version control.
+API_KEY = os.environ.get("GEMINI_API_KEY")
+if not API_KEY:
+    raise SystemExit("Set the GEMINI_API_KEY environment variable before running this script.")
+# Folder containing the cr1 images
+cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
+# Output folder for the JSON files
+output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"
+# Ensure output folder exists
+os.makedirs(output_json_folder, exist_ok=True)
+# Prompt: field values are extracted in Arabic, but the JSON keys are English
+prompt = """
+Please extract the following fields from the CR1 commercial registration document image. Extract the Arabic text, but return the output as JSON using the exact English keys below. If any field is missing, set its value to null.
+Fields to extract (in Arabic content):
+- Unified number
+- Establishment number
+- Commercial name of the institution
+- Its main center
+- Phone
+- Postal code
+- Trader name
+- Nationality
+- Date of birth
+- Civil registry number - residence
+- Its date
+- Its source
+- Passport number
+- Its date
+- Issuer
+- Activity
+- Capital
+- Name of the manager or authorized agent
+- Nationality
+- Date of birth
+- Civil registry number - residence
+- Its date
+- Its source
+- Certified by the commercial registration office in the city
+- That the above-mentioned institution is registered in the city
+- Certificates expire on
+- Receipt number
+- Date
+Return the output in JSON format with these exact keys:
+{
+  "unified_number": null,
+  "establishment_number": null,
+  "institution_name": null,
+  "main_office": null,
+  "phone": null,
+  "postal_code": null,
+  "merchant_name": null,
+  "nationality": null,
+  "birth_date": null,
+  "national_id": null,
+  "national_id_issue_date": null,
+  "national_id_issue_place": null,
+  "passport_number": null,
+  "passport_issue_date": null,
+  "passport_issuer": null,
+  "business_activity": null,
+  "capital": null,
+  "manager_name": null,
+  "manager_nationality": null,
+  "manager_birth_date": null,
+  "manager_national_id": null,
+  "manager_id_issue_date": null,
+  "manager_id_issue_place": null,
+  "registry_office_city": null,
+  "registered_in_city": null,
+  "certificate_expiry_date": null,
+  "receipt_number": null,
+  "receipt_date": null
+}
+"""
+# The key travels in a request header instead of the URL query string, so it
+# does not show up in the HTTP error messages printed below.
+url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
+headers = {"Content-Type": "application/json", "x-goog-api-key": API_KEY}
+# Upper bound (seconds) for one API call; without it a stalled connection
+# would block the whole batch indefinitely.
+REQUEST_TIMEOUT_SECONDS = 120
+# Preferred reply shape: a JSON object wrapped in a ```json fence.
+FENCED_JSON_RE = re.compile(r"```json\s*(\{.*\})\s*```", re.DOTALL)
+# Fallback when the model answers with a bare JSON object and no fence.
+BARE_JSON_RE = re.compile(r"(\{.*\})", re.DOTALL)
+
+# Iterate over all images (sorted so repeated runs process files in the same order)
+for image_name in sorted(os.listdir(cr1_images_folder)):
+    lowered_name = image_name.lower()
+    if not lowered_name.endswith(('.jpg', '.jpeg', '.png')):
+        continue
+    # Declare the real media type; PNG files were previously sent as image/jpeg
+    mime_type = "image/png" if lowered_name.endswith('.png') else "image/jpeg"
+    image_path = os.path.join(cr1_images_folder, image_name)
+    base_name = os.path.splitext(image_name)[0]
+    output_file = os.path.join(output_json_folder, base_name + ".json")
+    # Skip if JSON already exists
+    if os.path.exists(output_file):
+        print(f"Skipped {image_name} (JSON file already exists)")
+        continue
+    # Per-image boundary: any failure (unreadable file, HTTP error, unexpected
+    # response shape, invalid JSON) is reported and the batch moves on.
+    try:
+        # Read image and convert to base64
+        with open(image_path, "rb") as f:
+            image_b64 = base64.b64encode(f.read()).decode()
+        # Send request to Gemini API
+        data = {
+            "contents": [
+                {
+                    "role": "user",
+                    "parts": [
+                        {"text": prompt},
+                        {
+                            "inline_data": {
+                                "mime_type": mime_type,
+                                "data": image_b64
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+        response = requests.post(url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS)
+        response.raise_for_status()
+        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
+        match = FENCED_JSON_RE.search(response_text) or BARE_JSON_RE.search(response_text)
+        if match:
+            json_text = match.group(1)
+            result = json.loads(json_text)
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            print(f"✅ Processed: {image_name}")
+        else:
+            print(f"❌ Failed to extract JSON from: {image_name}")
+            print(response_text)
+    except Exception as e:
+        print(f"❌ Error processing image {image_name}: {e}")