abdullah-1111 committed on
Commit
81fdfcf
·
verified ·
1 Parent(s): 954546e

Update json-CR1.py

Browse files
Files changed (1) hide show
  1. json-CR1.py +145 -143
json-CR1.py CHANGED
@@ -1,143 +1,145 @@
1
- import base64
2
- import json
3
- import re
4
- import requests
5
- import os
6
-
7
- # مفتاح Gemini API
8
- API_KEY = "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
9
-
10
- # المجلد الذي يحتوي صور cr1
11
- cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
12
- # مجلد إخراج ملفات JSON
13
- output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"
14
-
15
- # Ensure output folder exists
16
- os.makedirs(output_json_folder, exist_ok=True)
17
-
18
- # Exact same prompt
19
- prompt = """
20
- Please extract the following fields from the CR1 commercial registration document image, all in Arabic only:
21
-
22
- - Unified number
23
- - Establishment number
24
- - Commercial name of the institution
25
- - Its main center
26
- - Phone
27
- - Postal code
28
- - Trader name
29
- - Nationality
30
- - Date of birth
31
- - Civil registry number - residence
32
- - Its date
33
- - Its source
34
- - Passport number
35
- - Its date
36
- - Issuer
37
- - Activity
38
- - Capital
39
- - Name of the manager or authorized agent
40
- - Nationality
41
- - Date of birth
42
- - Civil registry number - residence
43
- - Its date
44
- - Its source
45
- - Certified by the commercial registration office in the city
46
- - That the above-mentioned institution is registered in the city
47
- - Certificates expire on
48
- - Receipt number
49
- - Date
50
-
51
- Please return the result as JSON with these exact keys only, and if any field is missing, set its value to null:
52
-
53
- {
54
- "رقم الموحد": null,
55
- "رقم المنشأة": null,
56
- "الاسم التجاري للمؤسسة": null,
57
- "مركزها الرئيسي": null,
58
- "هاتف": null,
59
- "الرمز البريدي": null,
60
- "اسم التاجر": null,
61
- "الجنسية": null,
62
- "تاريخ الميلاد": null,
63
- "رقم السجل المدني-الإقامة": null,
64
- "تاريخه": null,
65
- "مصدره": null,
66
- "رقم الحفيظة-الجواز": null,
67
- "تاريخه_2": null,
68
- "مصدرة": null,
69
- "النشاط": null,
70
- "رأس المال": null,
71
- "اسم المدير أو الوكيل المفوض": null,
72
- "الجنسية_2": null,
73
- "تاريخ الميلاد_2": null,
74
- "رقم السجل المدني-الإقامة_2": null,
75
- "تاريخه_3": null,
76
- "مصدره_2": null,
77
- "يشهد مكتب السجل التجاري بمدينة": null,
78
- "بأنه تم تسجيل المؤسسة المذكورة أعلاه بمدينة": null,
79
- "تنتهي صلاحية الشهادات في": null,
80
- "بموجب الإيصال رقم": null,
81
- "تاريخ_الإيصال": null
82
- }
83
- """
84
-
85
- url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
86
- headers = {"Content-Type": "application/json"}
87
-
88
- # Iterate over all images
89
- for image_name in os.listdir(cr1_images_folder):
90
- if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
91
- continue
92
-
93
- image_path = os.path.join(cr1_images_folder, image_name)
94
- base_name = os.path.splitext(image_name)[0]
95
- output_file = os.path.join(output_json_folder, base_name + ".json")
96
-
97
- # Skip if JSON already exists
98
- if os.path.exists(output_file):
99
- print(f"Skipped {image_name} (JSON file already exists)")
100
- continue
101
-
102
- # Read image and convert to base64
103
- with open(image_path, "rb") as f:
104
- image_b64 = base64.b64encode(f.read()).decode()
105
-
106
- # Send request to Gemini API
107
- data = {
108
- "contents": [
109
- {
110
- "role": "user",
111
- "parts": [
112
- {"text": prompt},
113
- {
114
- "inline_data": {
115
- "mime_type": "image/jpeg",
116
- "data": image_b64
117
- }
118
- }
119
- ]
120
- }
121
- ]
122
- }
123
-
124
- try:
125
- response = requests.post(url, headers=headers, json=data)
126
- response.raise_for_status()
127
- response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
128
-
129
- match = re.search(r"```json\s*(\{.*\})\s*```", response_text, re.DOTALL)
130
- if match:
131
- json_text = match.group(1)
132
- result = json.loads(json_text)
133
-
134
- with open(output_file, "w", encoding="utf-8") as f:
135
- json.dump(result, f, ensure_ascii=False, indent=2)
136
-
137
- print(f"✅ Processed: {image_name}")
138
- else:
139
- print(f" Failed to extract JSON from: {image_name}")
140
- print(response_text)
141
-
142
- except Exception as e:
143
- print(f"❌ Error processing image {image_name}: {e}")
 
 
 
1
+ import base64
2
+ import json
3
+ import re
4
+ import requests
5
+ import os
6
+
# Batch-extract CR1 commercial-registration fields with the Gemini API.
#
# For every .jpg/.jpeg/.png file in `cr1_images_folder` that does not yet have
# a matching .json file in `output_json_folder`, the image is sent to Gemini
# together with `prompt`, and the JSON object found in the reply is saved.

# Gemini API key, read from the environment so the secret is never committed
# with the source. NOTE: any key that was previously committed to the
# repository should be revoked and replaced.
API_KEY = os.environ.get("GEMINI_API_KEY")

# Folder that contains the CR1 images
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
# Output folder for the JSON files
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"

# Instruction sent with every image: field values stay in Arabic, keys are
# fixed English identifiers so downstream code can rely on them.
prompt = """
Please extract the following fields from the CR1 commercial registration document image. Extract the Arabic text, but return the output as JSON using the exact English keys below. If any field is missing, set its value to null.

Fields to extract (in Arabic content):
- Unified number
- Establishment number
- Commercial name of the institution
- Its main center
- Phone
- Postal code
- Trader name
- Nationality
- Date of birth
- Civil registry number - residence
- Its date
- Its source
- Passport number
- Its date
- Issuer
- Activity
- Capital
- Name of the manager or authorized agent
- Nationality
- Date of birth
- Civil registry number - residence
- Its date
- Its source
- Certified by the commercial registration office in the city
- That the above-mentioned institution is registered in the city
- Certificates expire on
- Receipt number
- Date

Return the output in JSON format with these exact keys:

{
"unified_number": null,
"establishment_number": null,
"institution_name": null,
"main_office": null,
"phone": null,
"postal_code": null,
"merchant_name": null,
"nationality": null,
"birth_date": null,
"national_id": null,
"national_id_issue_date": null,
"national_id_issue_place": null,
"passport_number": null,
"passport_issue_date": null,
"passport_issuer": null,
"business_activity": null,
"capital": null,
"manager_name": null,
"manager_nationality": null,
"manager_birth_date": null,
"manager_national_id": null,
"manager_id_issue_date": null,
"manager_id_issue_place": null,
"registry_office_city": null,
"registered_in_city": null,
"certificate_expiry_date": null,
"receipt_number": null,
"receipt_date": null
}
"""

# The key is sent in the `x-goog-api-key` header instead of the query string,
# so it cannot leak through the URL that `requests` puts in error messages.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {"Content-Type": "application/json"}

# `requests` waits forever by default; bound each call so one stalled request
# cannot hang the whole batch.
REQUEST_TIMEOUT_SECONDS = 120

# Accepted image extensions and the MIME type declared to the API for each.
MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}
IMAGE_EXTENSIONS = tuple(MIME_TYPES)

# A reply may wrap the JSON in a Markdown code fence or return it bare.
FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*\})\s*```", re.DOTALL)
BARE_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)


def mime_type_for(image_name):
    """Return the MIME type to declare for *image_name*, based on its extension.

    Unknown extensions fall back to "image/jpeg", the value the script used
    for every file before.
    """
    extension = os.path.splitext(image_name)[1].lower()
    return MIME_TYPES.get(extension, "image/jpeg")


def extract_json(response_text):
    """Parse the JSON object contained in a model reply.

    A fenced block (three backticks, optionally tagged "json") is preferred;
    otherwise the span from the first "{" to the last "}" is used.

    Returns the parsed value, or None when the text contains no braces at all.
    Raises ValueError (json.JSONDecodeError) when the candidate span is not
    valid JSON.
    """
    fenced = FENCED_JSON_RE.search(response_text)
    if fenced:
        return json.loads(fenced.group(1))
    bare = BARE_JSON_RE.search(response_text)
    if bare:
        return json.loads(bare.group(0))
    return None


def build_request_body(image_b64, mime_type):
    """Build the generateContent payload: the prompt plus one inline image."""
    return {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": image_b64,
                        }
                    },
                ],
            }
        ]
    }


def process_image(image_name, api_key):
    """Send one image to Gemini and write the extracted fields to a JSON file.

    Images whose output file already exists are skipped. Any failure is
    reported on stdout and does not propagate, so a bad file or a failed
    request never aborts the rest of the batch.
    """
    image_path = os.path.join(cr1_images_folder, image_name)
    base_name = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_json_folder, base_name + ".json")

    # Skip if JSON already exists
    if os.path.exists(output_file):
        print(f"Skipped {image_name} (JSON file already exists)")
        return

    try:
        # Read image and convert to base64
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode()

        # Send request to Gemini API
        response = requests.post(
            url,
            headers={**headers, "x-goog-api-key": api_key},
            json=build_request_body(image_b64, mime_type_for(image_name)),
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']

        result = extract_json(response_text)
        if result is None:
            print(f"❌ Failed to extract JSON from: {image_name}")
            print(response_text)
            return

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f" Processed: {image_name}")
    except (requests.RequestException, OSError, ValueError,
            KeyError, IndexError, TypeError) as e:
        # Covers network/HTTP errors, unreadable or unwritable files, invalid
        # JSON, and replies that lack the expected candidates/parts structure.
        print(f"❌ Error processing image {image_name}: {e}")


def main():
    """Process every supported image in `cr1_images_folder`."""
    if not API_KEY:
        raise SystemExit(
            "Set the GEMINI_API_KEY environment variable to your Gemini API key."
        )

    # Ensure output folder exists
    os.makedirs(output_json_folder, exist_ok=True)

    # Iterate over all images (sorted for a reproducible processing order)
    for image_name in sorted(os.listdir(cr1_images_folder)):
        if not image_name.lower().endswith(IMAGE_EXTENSIONS):
            continue
        process_image(image_name, API_KEY)


if __name__ == "__main__":
    main()