File size: 4,522 Bytes
b278f6d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import base64
import json
import re
import requests
import os
# Gemini API key. It is read from the GEMINI_API_KEY environment variable so
# the secret is never stored in the source file.
# NOTE(review): any key that was previously committed in this file should be
# treated as compromised and rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    print("⚠️ GEMINI_API_KEY is not set; Gemini API requests are expected to fail.")

# Folder containing the CR6 document images to process. The variable keeps its
# "cr1" name because the rest of the script refers to it by that name.
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6"
# Folder that receives one JSON file per processed image.
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6\cr6_json"
# Ensure the output folder exists before any result is written.
os.makedirs(output_json_folder, exist_ok=True)
# Extraction prompt sent with every image. It lists the Arabic field labels to
# look for and the exact English JSON keys the model must answer with.
prompt = """
Please extract the following fields from the CR6 commercial registration document image, all in Arabic only:
- الكيان التجاري
- حالة السجل
- مدة المنشأة
- الرقم الوطني الموحد للمنشأة
- رابط المتجر الإكتروني
- رأس المال
- المدينة
- صندوق البريد
- الرمز البريدي
- هاتف
- تاريخ اصدار السجل
- تاريخ انتهاء السجل
- الموقع الاكتروني
- العنوان
- النشاط التجاري
Please return the result as JSON with the following exact keys, and if any field is missing, set its value to null:
{
"commercial_entity": null,
"registry_status": null,
"establishment_duration": null,
"unified_national_number": null,
"online_store_link": null,
"capital": null,
"city": null,
"po_box": null,
"postal_code": null,
"phone": null,
"registry_issue_date": null,
"registry_expiry_date": null,
"website": null,
"address": null,
"business_activity": null,
"building_number": null,
"additional_number": null,
"unit_number": null
}
"""
# The API key travels in the "x-goog-api-key" header rather than a "?key="
# query parameter: HTTP error messages include the request URL, so a key
# embedded in the URL would be printed whenever a request fails.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {"Content-Type": "application/json", "x-goog-api-key": API_KEY}
# Supported image extensions mapped to the MIME type declared to the API.
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Keys filled from a five-token address, in the order the tokens appear.
ADDRESS_PART_KEYS = (
    "building_number",
    "city",
    "postal_code",
    "additional_number",
    "unit_number",
)

# Matches a JSON object wrapped in a ```json fenced block.
FENCED_JSON_RE = re.compile(r"```json\s*(\{.*\})\s*```", re.DOTALL)

# Upper bound for one API call; without a timeout a stalled connection would
# block the whole batch.
REQUEST_TIMEOUT_SECONDS = 60


def guess_mime_type(image_name):
    """Return the MIME type for a supported image file name, else None.

    The match is case-insensitive and based on the file name suffix.
    """
    lowered = image_name.lower()
    for extension, mime_type in IMAGE_MIME_TYPES.items():
        if lowered.endswith(extension):
            return mime_type
    return None


def extract_json_text(response_text):
    """Return the JSON object text contained in a model reply, or None.

    A ```json fenced block is preferred. If the reply has no fence, the span
    from the first "{" to the last "}" is used so bare JSON replies are
    accepted as well.
    """
    fenced = FENCED_JSON_RE.search(response_text)
    if fenced:
        return fenced.group(1)
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start != -1 and end > start:
        return response_text[start:end + 1]
    return None


def split_address(result):
    """Fill the address component keys of *result* in place and return it.

    The address is read from the "address" key requested by the prompt (the
    Arabic key "العنوان" is still accepted as a fallback). When the address
    consists of exactly five whitespace-separated tokens they are assigned, in
    order, to ADDRESS_PART_KEYS; otherwise those keys are set to None. Values
    already present and not None are never overwritten. A missing, empty or
    non-string address leaves *result* untouched.
    """
    address = result.get("address") or result.get("العنوان")
    if not address or not isinstance(address, str):
        return result
    tokens = address.split()
    if len(tokens) == len(ADDRESS_PART_KEYS):
        values = tokens
    else:
        values = [None] * len(ADDRESS_PART_KEYS)
    for key, value in zip(ADDRESS_PART_KEYS, values):
        if result.get(key) is None:
            result[key] = value
    return result


def process_image(image_name, mime_type):
    """Send one image to the Gemini API and store the extracted fields as JSON.

    Nothing is sent when the output JSON file already exists. Errors (file
    access, HTTP failures, unexpected response shape, invalid JSON) propagate
    to the caller.
    """
    image_path = os.path.join(cr1_images_folder, image_name)
    base_name = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_json_folder, base_name + ".json")

    # Skip if JSON already exists
    if os.path.exists(output_file):
        print(f"⏩ Skipped {image_name} (JSON file already exists)")
        return

    # Read image and convert to base64
    with open(image_path, "rb") as image_file:
        image_b64 = base64.b64encode(image_file.read()).decode()

    data = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": image_b64,
                        }
                    },
                ],
            }
        ]
    }

    response = requests.post(
        url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
    response_text = response.json()['candidates'][0]['content']['parts'][0]['text']

    json_text = extract_json_text(response_text)
    if json_text is None:
        print(f"❌ Failed to extract JSON from: {image_name}")
        print(response_text)
        return

    result = split_address(json.loads(json_text))
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(result, json_file, ensure_ascii=False, indent=2)
    print(f"✅ Processed: {image_name}")


def main():
    """Process every supported image found in the input folder."""
    for image_name in os.listdir(cr1_images_folder):
        mime_type = guess_mime_type(image_name)
        if mime_type is None:
            continue
        try:
            process_image(image_name, mime_type)
        except Exception as e:
            # Per-image boundary: report the failure and continue so one bad
            # image or response does not abort the rest of the batch.
            print(f"❌ Error processing image {image_name}: {e}")


if __name__ == "__main__":
    main()
|