import base64
import json
import os
import re

import requests


# Batch-extract structured fields from CR1 commercial-registration images by
# sending each image to the Gemini generateContent endpoint and saving the
# returned JSON next to the images.

# NOTE(security): a literal API key was committed to this file. Supply the key
# through the GEMINI_API_KEY environment variable instead; the literal below is
# kept only as a fallback so existing runs keep working, and it should be
# rotated and then removed.
API_KEY = os.environ.get(
    "GEMINI_API_KEY", "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
)

# Folder scanned for input images.
cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1"
# Folder receiving one "<image base name>.json" file per processed image.
output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR1\cr1_json"

# Instruction sent with every image; it fixes the JSON keys of the reply.
prompt = """
Please extract the following fields from the CR1 commercial registration document image. Extract the Arabic text, but return the output as JSON using the exact English keys below. If any field is missing, set its value to null.

Fields to extract (in Arabic content):
- Unified number
- Establishment number
- Commercial name of the institution
- Its main center
- Phone
- Postal code
- Trader name
- Nationality
- Date of birth
- Civil registry number - residence
- Its date
- Its source
- Passport number
- Its date
- Issuer
- Activity
- Capital
- Name of the manager or authorized agent
- Nationality
- Date of birth
- Civil registry number - residence
- Its date
- Its source
- Certified by the commercial registration office in the city
- That the above-mentioned institution is registered in the city
- Certificates expire on
- Receipt number
- Date

Return the output in JSON format with these exact keys:

{
"unified_number": null,
"establishment_number": null,
"institution_name": null,
"main_office": null,
"phone": null,
"postal_code": null,
"merchant_name": null,
"nationality": null,
"birth_date": null,
"national_id": null,
"national_id_issue_date": null,
"national_id_issue_place": null,
"passport_number": null,
"passport_issue_date": null,
"passport_issuer": null,
"business_activity": null,
"capital": null,
"manager_name": null,
"manager_nationality": null,
"manager_birth_date": null,
"manager_national_id": null,
"manager_id_issue_date": null,
"manager_id_issue_place": null,
"registry_office_city": null,
"registered_in_city": null,
"certificate_expiry_date": null,
"receipt_number": null,
"receipt_date": null
}
"""

# The key travels in a request header rather than the query string: requests
# embeds the URL in HTTPError messages, which this script prints on failure.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {"Content-Type": "application/json", "x-goog-api-key": API_KEY}

# Accepted image extensions and the MIME type declared for each one.
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Upper bound (seconds) on a single API call so a stalled connection cannot
# hang the whole batch.
REQUEST_TIMEOUT_SECONDS = 120

# A JSON object wrapped in a Markdown code fence, with or without a "json" tag.
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*\})\s*```", re.DOTALL)


def guess_mime_type(image_name):
    """Return the MIME type for a supported image file name, else None.

    The extension is matched case-insensitively against IMAGE_MIME_TYPES.
    """
    extension = os.path.splitext(image_name)[1].lower()
    return IMAGE_MIME_TYPES.get(extension)


def extract_json_object(response_text):
    """Return the JSON object contained in a model reply, or None.

    A fenced code block is tried first; if that is absent or does not parse,
    the span from the first "{" to the last "}" is tried, which covers replies
    that contain bare JSON. Only a parsed dict is accepted.
    """
    candidates = []
    fenced = _FENCED_JSON_RE.search(response_text)
    if fenced:
        candidates.append(fenced.group(1))
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start != -1 and end > start:
        candidates.append(response_text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def build_request_payload(prompt_text, image_b64, mime_type):
    """Return the generateContent request body for one prompt and one image.

    Args:
        prompt_text: Instruction text sent as the first part.
        image_b64: Base64-encoded image bytes, as text.
        mime_type: MIME type declared for the inline image.
    """
    return {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt_text},
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": image_b64,
                        }
                    },
                ],
            }
        ]
    }


def process_image(image_name):
    """Extract the fields from one image and write them to a JSON file.

    Images whose output file already exists are skipped. Failures are
    reported on stdout and never raised, so one bad image cannot stop the
    batch.

    Returns:
        True if a JSON file was written, otherwise False.
    """
    image_path = os.path.join(cr1_images_folder, image_name)
    base_name = os.path.splitext(image_name)[0]
    output_file = os.path.join(output_json_folder, base_name + ".json")

    if os.path.exists(output_file):
        print(f"Skipped {image_name} (JSON file already exists)")
        return False

    try:
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode()

        data = build_request_payload(
            prompt, image_b64, guess_mime_type(image_name)
        )
        response = requests.post(
            url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        # KeyError/IndexError/TypeError here mean the reply carried no text
        # part in the expected place.
        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
    except (
        OSError,
        requests.RequestException,
        ValueError,
        KeyError,
        IndexError,
        TypeError,
    ) as e:
        print(f"❌ Error processing image {image_name}: {e}")
        return False

    result = extract_json_object(response_text)
    if result is None:
        print(f"❌ Failed to extract JSON from: {image_name}")
        print(response_text)
        return False

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Arabic text readable in the file.
            json.dump(result, f, ensure_ascii=False, indent=2)
    except OSError as e:
        print(f"❌ Error processing image {image_name}: {e}")
        return False

    print(f"✅ Processed: {image_name}")
    return True


def main():
    """Process every supported image found in cr1_images_folder."""
    os.makedirs(output_json_folder, exist_ok=True)

    for image_name in sorted(os.listdir(cr1_images_folder)):
        if guess_mime_type(image_name) is None:
            continue
        process_image(image_name)


if __name__ == "__main__":
    main()