# Update json-CR6.py

# b278f6d verified 8 months ago

# 4.52 kB

	import base64
	import json
	import re
	import requests
	import os

	# Gemini API key, read from the environment so that no credential is stored
	# in the source file.
	# NOTE(review): a literal key used to be committed on this line; treat that
	# key as compromised and revoke it.
	API_KEY = os.environ.get("GEMINI_API_KEY", "").strip()
	if not API_KEY:
	    raise SystemExit("GEMINI_API_KEY environment variable is not set")

	# Folder that holds the images to process. The variable name says "cr1" but
	# the path points at the CR6 folder; the name is kept because the rest of the
	# script refers to it.
	cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6"

	# Folder that receives the JSON output files.
	output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6\cr6_json"

	# Ensure output folder exists
	os.makedirs(output_json_folder, exist_ok=True)

	# Instruction text sent to Gemini together with each image. It first lists the
	# Arabic field labels to extract, then gives the JSON template whose English
	# keys the reply must use (missing fields are to be returned as null).
	# NOTE(review): the template contains three keys (building_number,
	# additional_number, unit_number) that have no matching label in the Arabic
	# list above it.
	prompt = """
	Please extract the following fields from the CR6 commercial registration document image, all in Arabic only:

	- الكيان التجاري
	- حالة السجل
	- مدة المنشأة
	- الرقم الوطني الموحد للمنشأة
	- رابط المتجر الإكتروني
	- رأس المال
	- المدينة
	- صندوق البريد
	- الرمز البريدي
	- هاتف
	- تاريخ اصدار السجل
	- تاريخ انتهاء السجل
	- الموقع الاكتروني
	- العنوان
	- النشاط التجاري

	Please return the result as JSON with the following exact keys, and if any field is missing, set its value to null:

	{
	"commercial_entity": null,
	"registry_status": null,
	"establishment_duration": null,
	"unified_national_number": null,
	"online_store_link": null,
	"capital": null,
	"city": null,
	"po_box": null,
	"postal_code": null,
	"phone": null,
	"registry_issue_date": null,
	"registry_expiry_date": null,
	"website": null,
	"address": null,
	"business_activity": null,
	"building_number": null,
	"additional_number": null,
	"unit_number": null
	}
	"""


	# Gemini REST endpoint. The API key is deliberately not part of the URL:
	# requests includes the URL in its HTTP error messages and this script prints
	# those, so a key in the query string would end up on the console.
	url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
	headers = {"Content-Type": "application/json"}

	# Upper bound, in seconds, for one Gemini request; without it a stalled
	# connection would block the whole batch forever.
	REQUEST_TIMEOUT = 120

	# Supported image extensions and the MIME type announced to the API for each.
	IMAGE_MIME_TYPES = {
	    ".jpg": "image/jpeg",
	    ".jpeg": "image/jpeg",
	    ".png": "image/png",
	}

	# A JSON object wrapped in a Markdown code fence (with or without "json" tag).
	FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)

	# Keys filled from the whitespace-separated parts of the address, in the order
	# in which the parts are expected to appear.
	ADDRESS_PART_KEYS = (
	    "building_number",
	    "city",
	    "postal_code",
	    "additional_number",
	    "unit_number",
	)


	def extract_json_object(response_text):
	    """Return the JSON object contained in a model reply.

	    The reply may wrap the object in a Markdown code fence or consist of the
	    bare object only.

	    Args:
	        response_text: Text part of the model reply.

	    Returns:
	        The decoded ``dict``, or ``None`` when no JSON object can be decoded.
	    """
	    fenced = FENCED_JSON_RE.search(response_text)
	    candidate = fenced.group(1) if fenced else response_text.strip()
	    try:
	        parsed = json.loads(candidate)
	    except ValueError:
	        return None
	    return parsed if isinstance(parsed, dict) else None


	def split_address_fields(result):
	    """Derive the address sub-fields of *result* from its ``"address"`` value.

	    When the address consists of exactly five whitespace-separated parts they
	    are mapped, in order, onto ``ADDRESS_PART_KEYS``. A value the model already
	    extracted for one of those keys is kept; only missing or null entries are
	    filled. In every other case the keys are merely guaranteed to exist (as
	    ``None``), so each output file has the same set of keys.

	    Args:
	        result: Decoded model reply; modified in place.

	    Returns:
	        The same ``dict``, for convenience.
	    """
	    address = result.get("address")
	    parts = address.split() if isinstance(address, str) else []
	    if len(parts) == len(ADDRESS_PART_KEYS):
	        for key, value in zip(ADDRESS_PART_KEYS, parts):
	            if result.get(key) is None:
	                result[key] = value
	    else:
	        for key in ADDRESS_PART_KEYS:
	            result.setdefault(key, None)
	    return result


	def main():
	    """Send every image of the input folder to Gemini and store the reply.

	    Images whose JSON file already exists are skipped. A failure on one image
	    is reported and the loop moves on to the next image.
	    """
	    # Authenticate through a header instead of the query string.
	    request_headers = dict(headers)
	    request_headers["x-goog-api-key"] = API_KEY

	    # Iterate over all images
	    for image_name in os.listdir(cr1_images_folder):
	        base_name, extension = os.path.splitext(image_name)
	        mime_type = IMAGE_MIME_TYPES.get(extension.lower())
	        if mime_type is None:
	            continue

	        image_path = os.path.join(cr1_images_folder, image_name)
	        output_file = os.path.join(output_json_folder, base_name + ".json")

	        # Skip if JSON already exists
	        if os.path.exists(output_file):
	            print(f"⏩ Skipped {image_name} (JSON file already exists)")
	            continue

	        try:
	            # Read image and convert to base64
	            with open(image_path, "rb") as f:
	                image_b64 = base64.b64encode(f.read()).decode()

	            # Send request to Gemini API
	            data = {
	                "contents": [
	                    {
	                        "role": "user",
	                        "parts": [
	                            {"text": prompt},
	                            {
	                                "inline_data": {
	                                    "mime_type": mime_type,
	                                    "data": image_b64,
	                                }
	                            },
	                        ],
	                    }
	                ]
	            }
	            response = requests.post(
	                url, headers=request_headers, json=data, timeout=REQUEST_TIMEOUT
	            )
	            response.raise_for_status()
	            response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
	        except (OSError, requests.RequestException, KeyError, IndexError,
	                TypeError, ValueError) as e:
	            print(f"❌ Error processing image {image_name}: {e}")
	            continue

	        result = extract_json_object(response_text)
	        if result is None:
	            print(f"❌ Failed to extract JSON from: {image_name}")
	            print(response_text)
	            continue

	        # Split the address field into its parts when it is present
	        split_address_fields(result)

	        try:
	            with open(output_file, "w", encoding="utf-8") as f:
	                json.dump(result, f, ensure_ascii=False, indent=2)
	        except OSError as e:
	            print(f"❌ Error processing image {image_name}: {e}")
	            continue

	        print(f"✅ Processed: {image_name}")


	if __name__ == "__main__":
	    main()