import base64
import json
import os
import re
import time

import requests


# Gemini API key, read from the GEMINI_API_KEY environment variable so the
# secret is not stored in source control. The key that used to be written
# here as a literal should be treated as leaked and rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    print("WARNING: GEMINI_API_KEY is not set; Gemini requests will be rejected.")


# Folder holding the CR5 document images to process. Override it with the
# CR5_IMAGES_DIR environment variable; the default is the original location.
# NOTE(review): the variable is still called ``cr1_images_folder`` because the
# rest of the script refers to it by that name, although it points at CR5 data.
cr1_images_folder = os.environ.get(
    "CR5_IMAGES_DIR",
    r"C:\Users\ASUS\Downloads\downloaded_images\classified\CR5",
)

# Folder receiving one JSON file per image. Override it with CR5_JSON_DIR;
# by default it is the "cr5_json" sub-folder of the images folder.
output_json_folder = os.environ.get(
    "CR5_JSON_DIR",
    os.path.join(cr1_images_folder, "cr5_json"),
)

# Create the output folder up front so the per-image writes cannot fail on a
# missing directory.
os.makedirs(output_json_folder, exist_ok=True)


# Instruction text sent with every image. It lists the Arabic field labels to
# look for on the CR5 document and the English JSON keys to return them under.
# The closing line asks for a ```json fenced block, which is the reply format
# the response parser in this script looks for.
prompt = """
Extract the following fields from the CR5 document image. Return both Arabic and English text where available:

الرقم
التاريخ
الرقم الموحد للمنشأة
الاسم التجاري للشركة
نوعها
جنسيتها
مدة الشركة
تبدا من
تنتهي في
مركزها الرئيسي
هاتف
الرمز البريدي
النشاط
رأس المال
المديرون
سلطات المدير/المديرون
يشهد مكتب السجل التجاري بمدينة
بأنه تم تسجيل المؤسسة المذكورة أعلاة بمدينة
وتنتهي صلاحية الشهادات في
بموجب الإيصال رقم
وتاريخ

Return as JSON with keys:
{
"document_number": ...,
"document_date": ...,
"unified_establishment_number": ...,
"company_name": ...,
"company_type": ...,
"nationality": ...,
"company_duration": ...,
"start_date": ...,
"end_date": ...,
"head_office": ...,
"phone": ...,
"postal_code": ...,
"business_activity": ...,
"capital": ...,
"managers": ...,
"manager_authority": ...,
"registry_office_city": ...,
"registered_in_city": ...,
"certificate_expiry_date": ...,
"receipt_number": ...,
"receipt_date": ...
}

If a field is missing, set it to null.
Respond with only the JSON object, wrapped in a ```json fenced code block.
"""


# Gemini generateContent endpoint. The API key travels in the x-goog-api-key
# header instead of the query string, so it does not show up in logged URLs
# or in the text of HTTP error messages, which include the request URL.
url = "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent"
headers = {
    "Content-Type": "application/json",
    "x-goog-api-key": API_KEY,
}


def split_address(address, separator="،", min_parts=4):
    """Split an address string into its separator-delimited components.

    Args:
        address: Address text. Anything that is not a non-blank string
            (``None``, ``""``, a dict, ...) is treated as "no address".
        separator: Delimiter between components; defaults to the Arabic
            comma.
        min_parts: Minimum length of the returned list; shorter results are
            padded with ``None``.

    Returns:
        A list of at least ``min_parts`` items: the stripped components in
        order, followed by ``None`` padding. Extra components beyond
        ``min_parts`` are kept.
    """
    # Guard first: calling .split on None or a non-string would raise.
    if not isinstance(address, str) or not address.strip():
        return [None] * min_parts

    parts = [piece.strip() for piece in address.split(separator)]
    # A negative count yields an empty list, so long addresses are untouched.
    parts.extend([None] * (min_parts - len(parts)))
    return parts


# ---------------------------------------------------------------------------
# Batch driver: send every image in ``cr1_images_folder`` to the Gemini
# endpoint, parse the JSON in the reply, add the split address components and
# write one JSON file per image into ``output_json_folder``.
# ---------------------------------------------------------------------------

# Matches a JSON object inside a ```json fenced block.
_FENCED_JSON_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)

# MIME type to declare for each accepted image extension.
_IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Keys under which the head-office address may be returned: the English key
# requested by the prompt first, then the Arabic label as a fallback.
_ADDRESS_KEYS = ("head_office", "مركزها الرئيسي")

# Output keys for the four address components, in split order.
_ADDRESS_PART_KEYS = ("رقم المبنى", "اسم الشارع", "اسم الحي", "الرقم الإضافي")

_REQUEST_TIMEOUT = (10, 120)  # (connect, read) seconds
_REQUEST_DELAY = 3  # seconds to wait after each attempt


def _extract_json(text):
    """Return the JSON object contained in *text*, or None if there is none.

    A ```json fenced block is preferred; otherwise the span from the first
    "{" to the last "}" is tried so an unfenced reply is still accepted.
    """
    fenced = _FENCED_JSON_RE.search(text)
    if fenced:
        candidate = fenced.group(1)
    else:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            return None
        candidate = text[start:end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    # Only an object can receive the extra address keys.
    return parsed if isinstance(parsed, dict) else None


def _address_fields(result):
    """Return the four address-component entries derived from *result*.

    Every component is None when no address key holds a non-blank string
    (for example when the field is null or was returned as a nested object).
    """
    address = None
    for key in _ADDRESS_KEYS:
        value = result.get(key)
        if isinstance(value, str) and value.strip():
            address = value
            break

    parts = split_address(address) if address else [None] * len(_ADDRESS_PART_KEYS)
    # zip() drops any components beyond the four that are stored.
    return dict(zip(_ADDRESS_PART_KEYS, parts))


for image_name in os.listdir(cr1_images_folder):
    if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    image_path = os.path.join(cr1_images_folder, image_name)
    base_name, extension = os.path.splitext(image_name)
    output_file = os.path.join(output_json_folder, base_name + ".json")

    # An existing output file marks the image as done, so re-runs resume.
    if os.path.exists(output_file):
        print(f"Skipped {image_name} (JSON file already exists)")
        continue

    try:
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("ascii")

        data = {
            "contents": [
                {
                    "role": "user",
                    "parts": [
                        {"text": prompt},
                        {
                            "inline_data": {
                                # Declare the real type; PNG files are accepted too.
                                "mime_type": _IMAGE_MIME_TYPES.get(
                                    extension.lower(), "image/jpeg"
                                ),
                                "data": image_b64,
                            }
                        },
                    ],
                }
            ]
        }

        # Without a timeout a stalled connection would hang the whole batch.
        response = requests.post(
            url, headers=headers, json=data, timeout=_REQUEST_TIMEOUT
        )
        response.raise_for_status()
        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']

        result = _extract_json(response_text)
        if result is None:
            print(f"❌ Failed to extract JSON from: {image_name}")
            print(response_text)
        else:
            result.update(_address_fields(result))
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"✅ Processed: {image_name}")

    except Exception as e:
        # Per-image boundary: report and move on so one bad image or request
        # does not stop the batch. HTTP error text contains the request URL,
        # so mask the key in case it is part of that URL.
        detail = str(e)
        if API_KEY:
            detail = detail.replace(API_KEY, "***")
        print(f"❌ Error processing image {image_name}: {detail}")

    finally:
        # Pause after failures as well, so errors are not retried against
        # the API back-to-back.
        time.sleep(_REQUEST_DELAY)
|
|