# File size: 4,522 Bytes
# b278f6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE: the line-number gutter (1-141) copied from the web file viewer was removed here; the script starts below.
import base64
import json
import re
import requests
import os

# Gemini API key.
# SECURITY: a key committed to source should be treated as leaked. Supply the
# key through the GEMINI_API_KEY environment variable; the literal is kept only
# as a fallback so existing runs keep working, and should be rotated and
# removed from the file.
API_KEY = os.environ.get("GEMINI_API_KEY") or "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"

# Folder containing the input images. The variable name says "cr1" but the
# path points at the CR6 folder; the name is kept because the processing loop
# further down refers to it.
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6"

# Output folder for the generated JSON files (one per image).
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6\cr6_json"

# Ensure output folder exists
os.makedirs(output_json_folder, exist_ok=True)

# Instruction text sent to the model together with every image. It lists the
# Arabic field labels to extract and the exact English JSON keys to return,
# asking for null when a field is missing.
# NOTE(review): the JSON template contains building_number, additional_number
# and unit_number, which do not appear in the bulleted field list.
# NOTE(review): the labels for the online-store link and the website look like
# they are missing a letter in the Arabic word for "electronic" - confirm
# against the labels printed on the documents before changing the text.
prompt = """
Please extract the following fields from the CR6 commercial registration document image, all in Arabic only:

- الكيان التجاري
- حالة السجل
- مدة المنشأة
- الرقم الوطني الموحد للمنشأة
- رابط المتجر الإكتروني
- رأس المال
- المدينة
- صندوق البريد
- الرمز البريدي
- هاتف
- تاريخ اصدار السجل
- تاريخ انتهاء السجل
- الموقع الاكتروني
- العنوان
- النشاط التجاري

Please return the result as JSON with the following exact keys, and if any field is missing, set its value to null:

{
  "commercial_entity": null,
  "registry_status": null,
  "establishment_duration": null,
  "unified_national_number": null,
  "online_store_link": null,
  "capital": null,
  "city": null,
  "po_box": null,
  "postal_code": null,
  "phone": null,
  "registry_issue_date": null,
  "registry_expiry_date": null,
  "website": null,
  "address": null,
  "business_activity": null,
  "building_number": null,
  "additional_number": null,
  "unit_number": null
}
"""


# Gemini generateContent endpoint.
# The API key is sent in the "x-goog-api-key" header instead of a "?key="
# query parameter: requests puts the full URL into HTTPError messages, and the
# processing loop prints those messages, so a key in the URL would be echoed
# to the console on every failed request.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {
    "Content-Type": "application/json",
    "x-goog-api-key": API_KEY,
}

# Seconds to wait for the API before giving up on a single image, so one hung
# connection cannot stall the whole batch.
_REQUEST_TIMEOUT = 120

# Keys from the prompt's JSON template that are filled from a five-token
# address string, in token order.
_ADDRESS_PART_KEYS = (
    "building_number",
    "city",
    "postal_code",
    "additional_number",
    "unit_number",
)


def _mime_type_for(image_name):
    """Return the MIME type to declare for *image_name* (PNG or JPEG)."""
    return "image/png" if image_name.lower().endswith(".png") else "image/jpeg"


def _extract_json_text(response_text):
    """Return the JSON object text found in the model reply, or None.

    A ```json fenced block is preferred; if the reply is not fenced, the span
    from the first "{" to the last "}" is used instead.
    """
    fenced = re.search(r"```json\s*(\{.*\})\s*```", response_text, re.DOTALL)
    if fenced:
        return fenced.group(1)
    bare = re.search(r"\{.*\}", response_text, re.DOTALL)
    return bare.group(0) if bare else None


def _split_address(result):
    """Fill the address component keys of *result* in place.

    The address is read from the "address" key requested by the prompt (the
    Arabic key is accepted as a fallback). When it splits into exactly five
    whitespace-separated tokens they are assigned, in order, to
    ``_ADDRESS_PART_KEYS``; otherwise those keys are set to None. Values the
    model already returned for a key are never overwritten.
    """
    address = result.get("address") or result.get("العنوان")
    if not isinstance(address, str) or not address.strip():
        return

    tokens = address.split()
    if len(tokens) != len(_ADDRESS_PART_KEYS):
        tokens = [None] * len(_ADDRESS_PART_KEYS)

    for key, token in zip(_ADDRESS_PART_KEYS, tokens):
        if result.get(key) is None:
            result[key] = token


# Iterate over all images in the input folder and write one JSON file per image.
for image_name in os.listdir(cr1_images_folder):
    if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    image_path = os.path.join(cr1_images_folder, image_name)
    base_name = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_json_folder, base_name + ".json")

    # Skip if JSON already exists, so the script can be re-run to resume.
    if os.path.exists(output_file):
        print(f"⏩ Skipped {image_name} (JSON file already exists)")
        continue

    try:
        # Read image and convert to base64 (inside the try so that one
        # unreadable file does not abort the remaining images).
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode()

        # Request body: the prompt text plus the inline image.
        data = {
            "contents": [
                {
                    "role": "user",
                    "parts": [
                        {"text": prompt},
                        {
                            "inline_data": {
                                "mime_type": _mime_type_for(image_name),
                                "data": image_b64
                            }
                        }
                    ]
                }
            ]
        }

        response = requests.post(
            url, headers=headers, json=data, timeout=_REQUEST_TIMEOUT
        )
        response.raise_for_status()
        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']

        json_text = _extract_json_text(response_text)
        if json_text is None:
            print(f"❌ Failed to extract JSON from: {image_name}")
            print(response_text)
            continue

        result = json.loads(json_text)

        # Split the address field into its components if it is present.
        if isinstance(result, dict):
            _split_address(result)

        # Serialize before opening the file so a serialization error cannot
        # leave behind a partial file that later runs would skip.
        serialized = json.dumps(result, ensure_ascii=False, indent=2)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(serialized)

        print(f"✅ Processed: {image_name}")

    except (requests.RequestException, OSError, KeyError, IndexError,
            TypeError, ValueError) as e:
        # Network/HTTP failures, unreadable files, an unexpected response
        # shape, or invalid JSON: report and move on to the next image.
        print(f"❌ Error processing image {image_name}: {e}")