import base64
import json
import os
import re

import requests


# Batch-extracts CR6 commercial registration fields from document images using
# the Gemini generateContent REST endpoint, writing one JSON file per image.

# SECURITY: prefer supplying the key through the GEMINI_API_KEY environment
# variable. The literal fallback is kept only so existing runs keep working;
# it is committed in source and should be rotated and removed.
API_KEY = os.environ.get("GEMINI_API_KEY", "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE")

# Folder scanned for input images.
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6"

# Folder receiving one "<image base name>.json" per processed image.
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6\cr6_json"

prompt = """
Please extract the following fields from the CR6 commercial registration document image, all in Arabic only:

- الكيان التجاري
- حالة السجل
- مدة المنشأة
- الرقم الوطني الموحد للمنشأة
- رابط المتجر الإكتروني
- رأس المال
- المدينة
- صندوق البريد
- الرمز البريدي
- هاتف
- تاريخ اصدار السجل
- تاريخ انتهاء السجل
- الموقع الاكتروني
- العنوان
- النشاط التجاري

Please return the result as JSON with the following exact keys, and if any field is missing, set its value to null:

{
"commercial_entity": null,
"registry_status": null,
"establishment_duration": null,
"unified_national_number": null,
"online_store_link": null,
"capital": null,
"city": null,
"po_box": null,
"postal_code": null,
"phone": null,
"registry_issue_date": null,
"registry_expiry_date": null,
"website": null,
"address": null,
"business_activity": null,
"building_number": null,
"additional_number": null,
"unit_number": null
}
"""

# The key travels in a header rather than the query string: requests includes
# the URL in HTTP error messages, which this script prints.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {"Content-Type": "application/json", "x-goog-api-key": API_KEY}

# Upper bound for one API call so a stalled connection cannot hang the batch.
REQUEST_TIMEOUT_SECONDS = 120

# Accepted image suffixes and the MIME type declared to the API for each.
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Positional meaning of the five whitespace-separated tokens of an address.
ADDRESS_PART_KEYS = (
    "building_number",
    "city",
    "postal_code",
    "additional_number",
    "unit_number",
)

FENCED_JSON_RE = re.compile(r"```json\s*(\{.*\})\s*```", re.DOTALL)


def mime_type_for(image_name):
    """Return the MIME type for a supported image file name, else None.

    Matching is case-insensitive and based on the file name suffix.
    """
    lowered = image_name.lower()
    for suffix, mime_type in IMAGE_MIME_TYPES.items():
        if lowered.endswith(suffix):
            return mime_type
    return None


def extract_response_text(payload):
    """Return the text of the first part of the first candidate in *payload*.

    Raises:
        ValueError: if the decoded API response lacks that structure.
    """
    try:
        return payload["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError("API response contained no text candidate") from err


def extract_json_object(text):
    """Return the JSON object embedded in *text*, or None if there is none.

    A ```json fenced block is tried first; the outermost ``{...}`` span is the
    fallback for replies that omit the fence.
    """
    candidates = []
    fenced = FENCED_JSON_RE.search(text)
    if fenced:
        candidates.append(fenced.group(1))
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def split_address(address):
    """Split a five-token address string into its named parts.

    Returns a dict keyed by ADDRESS_PART_KEYS, or None when *address* is not a
    string made of exactly five whitespace-separated tokens.
    """
    if not isinstance(address, str):
        return None
    tokens = address.split()
    if len(tokens) != len(ADDRESS_PART_KEYS):
        return None
    return dict(zip(ADDRESS_PART_KEYS, tokens))


def apply_address_parts(result):
    """Fill the address-derived fields of *result* in place and return it.

    Uses the same English keys the prompt asks the model for. Only null or
    missing fields are filled, so values the model extracted are kept. Every
    key in ADDRESS_PART_KEYS is present afterwards.
    """
    # The Arabic key is accepted as a fallback in case the model ignores the
    # requested key names.
    parts = split_address(result.get("address") or result.get("العنوان"))
    for key in ADDRESS_PART_KEYS:
        if result.get(key) is None:
            result[key] = parts[key] if parts else None
    return result


def request_extraction(image_path, mime_type):
    """Send one image with the prompt to the API and return the reply text.

    Raises:
        OSError: if the image cannot be read.
        requests.RequestException: on network failure, timeout or HTTP error.
        ValueError: if the response carries no text candidate.
    """
    with open(image_path, "rb") as image_file:
        image_b64 = base64.b64encode(image_file.read()).decode()

    data = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": image_b64,
                        }
                    },
                ],
            }
        ]
    }

    response = requests.post(
        url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
    return extract_response_text(response.json())


def main():
    """Process every supported image in the input folder that has no JSON yet."""
    os.makedirs(output_json_folder, exist_ok=True)

    for image_name in os.listdir(cr1_images_folder):
        mime_type = mime_type_for(image_name)
        if mime_type is None:
            continue

        image_path = os.path.join(cr1_images_folder, image_name)
        base_name = os.path.splitext(image_name)[0]
        output_file = os.path.join(output_json_folder, base_name + ".json")

        # An existing output file marks the image as done, so reruns resume.
        if os.path.exists(output_file):
            print(f"⏩ Skipped {image_name} (JSON file already exists)")
            continue

        try:
            response_text = request_extraction(image_path, mime_type)
            result = extract_json_object(response_text)
            if result is None:
                print(f"❌ Failed to extract JSON from: {image_name}")
                print(response_text)
                continue

            apply_address_parts(result)

            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            print(f"✅ Processed: {image_name}")
        except Exception as e:
            # Deliberate per-image boundary: one bad image or failed request
            # must not abort the rest of the batch.
            print(f"❌ Error processing image {image_name}: {e}")


if __name__ == "__main__":
    main()