abdullah-1111 committed on
Commit
da2b7d6
·
verified ·
1 Parent(s): a16c51d

Update json-V3.py

Browse files
Files changed (1) hide show
  1. json-V3.py +105 -104
json-V3.py CHANGED
@@ -1,104 +1,105 @@
1
- import base64
2
- import json
3
- import re
4
- import requests
5
- import os
6
-
7
- # مفتاح Gemini API
8
- API_KEY = "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
9
-
10
- # المجلد الذي يحتوي صور cr1
11
- cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
12
- # مجلد إخراج ملفات JSON
13
- output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"
14
-
15
- # Ensure output folder exists
16
- os.makedirs(output_json_folder, exist_ok=True)
17
-
18
- # Exact same prompt
19
- prompt = """
20
- استخرج الحقول التالية من مستند باللغة العربية فقط، وأرجعها بصيغة JSON:
21
-
22
- 1. اسم المكلف
23
- 2. عنوان المركز الرئيسي
24
- 3. المدينة
25
- 4. الحي
26
- 5. صندوق البريد
27
- 6. الرمز البريدي
28
- 7. الهاتف
29
-
30
- الصيغة المطلوبة:
31
-
32
- {
33
- "اسم المكلف": "...",
34
- "عنوان المركز الرئيسي": "...",
35
- "المدينة": "...",
36
- "الحي": "...",
37
- "صندوق البريد": "...",
38
- "الرمز البريدي": "...",
39
- "الهاتف": "..."
40
- }
41
-
42
- إذا لم يوجد حقل، أرجعه كـ null.
43
- """
44
-
45
-
46
- url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
47
- headers = {"Content-Type": "application/json"}
48
-
49
- # Iterate over all images
50
- for image_name in os.listdir(cr1_images_folder):
51
- if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
52
- continue
53
-
54
- image_path = os.path.join(cr1_images_folder, image_name)
55
- base_name = os.path.splitext(image_name)[0]
56
- output_file = os.path.join(output_json_folder, base_name + ".json")
57
-
58
- # Skip if JSON already exists
59
- if os.path.exists(output_file):
60
- print(f"Skipped {image_name} (JSON file already exists)")
61
- continue
62
-
63
- # Read image and convert to base64
64
- with open(image_path, "rb") as f:
65
- image_b64 = base64.b64encode(f.read()).decode()
66
-
67
- # Send request to Gemini API
68
- data = {
69
- "contents": [
70
- {
71
- "role": "user",
72
- "parts": [
73
- {"text": prompt},
74
- {
75
- "inline_data": {
76
- "mime_type": "image/jpeg",
77
- "data": image_b64
78
- }
79
- }
80
- ]
81
- }
82
- ]
83
- }
84
-
85
- try:
86
- response = requests.post(url, headers=headers, json=data)
87
- response.raise_for_status()
88
- response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
89
-
90
- match = re.search(r"```json\s*(\{.*\})\s*```", response_text, re.DOTALL)
91
- if match:
92
- json_text = match.group(1)
93
- result = json.loads(json_text)
94
-
95
- with open(output_file, "w", encoding="utf-8") as f:
96
- json.dump(result, f, ensure_ascii=False, indent=2)
97
-
98
- print(f"✅ Processed: {image_name}")
99
- else:
100
- print(f"❌ Failed to extract JSON from: {image_name}")
101
- print(response_text)
102
-
103
- except Exception as e:
104
- print(f"❌ Error processing image {image_name}: {e}")
 
 
1
+ import base64
2
+ import json
3
+ import re
4
+ import requests
5
+ import os
6
+
# Gemini API key. It is read from the environment so the secret is never
# committed to source control; export GEMINI_API_KEY before running the script.
API_KEY = os.environ.get("GEMINI_API_KEY")

# Folder that contains the CR1 images.
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
# Output folder for the generated JSON files.
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"

# Extraction prompt sent with every image. The field labels are the Arabic
# labels to look for in the document; the JSON keys of the answer are English.
prompt = """
Extract the following fields from the document (in Arabic only) and return them in JSON format:

1. اسم المكلف
2. عنوان المركز الرئيسي
3. المدينة
4. الحي
5. صندوق البريد
6. الرمز البريدي
7. الهاتف

The required JSON format:

{
"taxpayer_name": "...",
"headquarters_address": "...",
"city": "...",
"district": "...",
"po_box": "...",
"postal_code": "...",
"phone": "..."
}

If any field is missing, return it as null.
"""

# The key is deliberately NOT part of the URL: requests includes the URL in
# HTTP error messages, which would print the secret. It is sent in the
# "x-goog-api-key" header instead (see main()).
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {"Content-Type": "application/json"}

# Upper bound for one API call, so a stalled connection cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 60

# Supported image extensions and the MIME type declared to the API for each.
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}


def guess_mime_type(image_name):
    """Return the MIME type for a supported image file name, else None.

    The match is case-insensitive and based only on the file extension.
    """
    extension = os.path.splitext(image_name)[1].lower()
    return IMAGE_MIME_TYPES.get(extension)


def extract_json_object(response_text):
    """Return the JSON object contained in the model reply, or None.

    A fenced code block (with or without the "json" tag) is preferred; when
    the reply has no fence, the span from the first "{" to the last "}" is
    tried instead. None is returned when nothing parses to a JSON object.
    """
    fenced = re.search(r"```(?:json)?\s*(\{.*\})\s*```", response_text, re.DOTALL)
    if fenced:
        candidate = fenced.group(1)
    else:
        start = response_text.find("{")
        end = response_text.rfind("}")
        if start == -1 or end <= start:
            return None
        candidate = response_text[start:end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def build_request_payload(image_b64, mime_type):
    """Build the generateContent request body for one base64-encoded image."""
    return {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": image_b64,
                        }
                    },
                ],
            }
        ]
    }


def main():
    """Send every image in the CR1 folder to Gemini and save the JSON answer.

    Images that already have a JSON file in the output folder are skipped, so
    the script can be re-run after a partial failure. A failure on one image
    is reported and does not stop the remaining images.

    Raises:
        SystemExit: if the GEMINI_API_KEY environment variable is not set.
    """
    if not API_KEY:
        raise SystemExit("GEMINI_API_KEY environment variable is not set")

    # Ensure the output folder exists.
    os.makedirs(output_json_folder, exist_ok=True)

    request_headers = {**headers, "x-goog-api-key": API_KEY}

    for image_name in os.listdir(cr1_images_folder):
        mime_type = guess_mime_type(image_name)
        if mime_type is None:
            continue

        image_path = os.path.join(cr1_images_folder, image_name)
        base_name = os.path.splitext(image_name)[0]
        output_file = os.path.join(output_json_folder, base_name + ".json")

        # Skip if the JSON file already exists.
        if os.path.exists(output_file):
            print(f"Skipped {image_name} (JSON file already exists)")
            continue

        try:
            # Read the image and convert it to base64.
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()

            response = requests.post(
                url,
                headers=request_headers,
                json=build_request_payload(image_b64, mime_type),
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
        except (OSError, requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
            print(f"❌ Error processing image {image_name}: {e}")
            continue

        result = extract_json_object(response_text)
        if result is None:
            print(f"❌ Failed to extract JSON from: {image_name}")
            print(response_text)
            continue

        # Written only after a successful parse, so a failed image never
        # leaves a file behind that would make the next run skip it.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f"✅ Processed: {image_name}")


if __name__ == "__main__":
    main()