Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import base64 | |
| import re | |
| import numpy as np | |
| from PIL import Image | |
| from fastapi import FastAPI, Request | |
| import json | |
| import uvicorn | |
# pytesseract is optional: record whether it could be imported so the rest of
# the module can branch on TESSERACT_AVAILABLE instead of failing at import.
try:
    import pytesseract
except ImportError:
    TESSERACT_AVAILABLE = False
    print("⚠️ Pytesseract tidak ditemukan. Menggunakan OCR fallback.")
else:
    TESSERACT_AVAILABLE = True
def process_base64_image(base64_string):
    """Decode a base64 string (optionally a data URL) into a PIL image.

    Args:
        base64_string: Base64 text. When it contains ``data:image`` the
            segment following the first comma is used as the payload.

    Returns:
        The object returned by ``Image.open`` on success, or ``None`` when
        anything goes wrong (the error is printed, not raised).
    """
    try:
        payload = base64_string
        # Data URLs look like "data:image/png;base64,<payload>"; keep only
        # the payload segment.
        if 'data:image' in payload:
            payload = payload.split(',')[1]
        raw_bytes = base64.b64decode(payload)
        return Image.open(io.BytesIO(raw_bytes))
    except Exception as err:
        # Best-effort decoder: report the problem and signal failure with None.
        print(f"Error processing base64 image: {err}")
        return None
def clean_ocr_text(text):
    """Normalise raw OCR output into a single line of space-separated text.

    The steps run in an order that keeps words intact:

    1. Words hyphenated across a line break are re-joined while the line
       breaks still exist.
    2. Non-whitespace control characters are removed.
    3. Every run of whitespace (including newlines and tabs) becomes one
       space, and the result is stripped.

    Args:
        text: Raw OCR text.

    Returns:
        str: The cleaned text. It never contains newline characters.
    """
    # Only letters on both sides count as a hyphenated word, so numbers that
    # happen to break after a dash (e.g. "12-\n01-2024") keep their dash.
    text = re.sub(r'(?<=[^\W\d_])-[ \t]*\r?\n[ \t]*(?=[^\W\d_])', '', text)
    # Remove control characters, but leave whitespace ones (\n, \r, \t, ...)
    # for the next step: deleting them outright would glue the last word of
    # one line to the first word of the next.
    text = re.sub(r'(?!\s)[\x00-\x1F\x7F-\x9F]', '', text)
    return re.sub(r'\s+', ' ', text).strip()
def validate_field(field_name, value):
    """Clean one extracted form value and reject obvious garbage.

    Args:
        field_name: Key of the field being validated (for example
            ``"nama_pelapor"``, ``"tanggal"`` or ``"lokasi"``). It selects a
            field-specific rule; other names only get the generic checks.
        value: Raw string captured for the field.

    Returns:
        The cleaned string, or ``None`` when the value is empty, fails its
        field-specific rule, or looks like a placeholder.
    """
    if not value:
        return None

    # Trim non-word noise (borders, colons, dashes, dots) from both ends.
    # A value made only of such characters, e.g. "-" or "...", ends up empty.
    value = re.sub(r'^[^\w]+|[^\w]+$', '', value).strip()
    if not value:
        return None

    if field_name == "nama_pelapor":
        # A reporter name needs at least two ASCII letters.
        if len(re.findall(r'[a-zA-Z]', value)) < 2:
            return None
        # Pipes and slashes inside a name are table-border artefacts.
        value = re.sub(r'[|/\\]', '', value).strip()
    elif field_name == "tanggal":
        looks_like_date = re.search(
            r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}|\d{1,2}[-/\s]+\w+[-/\s]+\d{2,4}|\d{2}[-/]\d{2}[-/]\d{2,4}',
            value,
        )
        if not looks_like_date:
            # Fall back to a purely numeric date embedded in the string.
            numeric_date = re.search(r'\d{1,2}[-/\s]+\d{1,2}[-/\s]+\d{2,4}', value)
            if not numeric_date:
                return None
            value = numeric_date.group(0)
    elif field_name == "lokasi":
        # A location must be longer than two characters.
        if len(value) <= 2:
            return None
    elif field_name in ("bahaya", "uraian_pengamatan", "tindakan_intervensi"):
        # Free-text fields must be long enough and must not contain these
        # marker strings.
        if len(value) < 3 or any(marker in value for marker in ("No.", "Revisi", "FM-")):
            return None

    # Placeholder detection is case-insensitive. A bare "-" is deliberately
    # NOT a substring placeholder: that would reject every hyphenated value,
    # including dates such as "12-01-2024" accepted just above.
    lowered = value.lower()
    if any(token in lowered for token in ("...", "___", "diisi oleh", "xxx")):
        return None
    # "N/A" counts only as a standalone token, so text such as
    # "tindakan/aman" is not mistaken for it.
    if re.search(r'\bn/a\b', lowered):
        return None

    return value
def parse_lsb_form(text):
    """Turn the OCR text of an LSB form into a dict of validated fields.

    The text is cleaned with ``clean_ocr_text`` and then mined in four passes:
    label regexes, a checkbox fallback for the observation type, the LSB
    number, and a "Label: value" pass over the text split on newlines. Every
    collected value is validated once more before being returned.

    Args:
        text: Raw OCR text.

    Returns:
        dict: Field name -> cleaned string, containing only the fields that
        passed ``validate_field``.
    """
    text = clean_ocr_text(text)
    extracted = {}

    # The document type is recognised by its literal title.
    if "LAPORAN SUMBER BAHAYA" in text:
        extracted["jenis_dokumen"] = "LAPORAN SUMBER BAHAYA"

    # Pass 1: a label followed by its value, matched case-insensitively.
    label_patterns = {
        "nama_pelapor": r"(?:NAMA\s*PELAPOR|PELAPOR)[^A-Za-z0-9]*\s*([^\n|]{2,40})",
        "lokasi": r"(?:LOKASI\s*KEJADIAN|LOKASI)[^A-Za-z0-9]*\s*([^\n|]{2,50})",
        "tanggal": r"(?:TANGGAL\s*/?\s*WAKTU|TANGGAL)[^A-Za-z0-9]*\s*([^\n|]{2,30})",
        "posisi_jabatan": r"(?:POSISI\s*/?\s*JABATAN|JABATAN)[^A-Za-z0-9]*\s*([^\n|]{2,40})",
        "jenis_pengamatan": r"(?:JENIS\s*PENGAMATAN)[^A-Za-z0-9]*\s*([^\n|]{2,50})",
        "uraian_pengamatan": r"(?:URAIAN\s*PENGAMATAN)[^A-Za-z0-9]*\s*([^\n|]{2,100})",
        "bahaya": r"(?:BAHAYA)[^A-Za-z0-9]*\s*([^\n|]{2,100})",
        "tindakan_intervensi": r"(?:TINDAKAN\s*INTERVENSI)[^A-Za-z0-9/]*\s*([^\n|]{2,100})",
        "saran_perbaikan": r"(?:SARAN\s*PERBAIKAN)[^A-Za-z0-9:]*\s*([^\n|]{2,100})"
    }
    for name, regex in label_patterns.items():
        hit = re.search(regex, text, re.IGNORECASE)
        if not hit:
            continue
        checked = validate_field(name, hit.group(1).strip())
        if checked:
            extracted[name] = checked

    # Pass 2: if no observation type was captured, infer it from checkbox
    # captions. The order is a priority: the first caption found wins.
    if not extracted.get("jenis_pengamatan"):
        checkbox_captions = (
            (r'(?:Unsafe\s*Condition|Kondisi\s*Tidak\s*Aman|Unsafe\s*C)', "Unsafe Condition"),
            (r'(?:Unsafe\s*Action|Tindakan\s*Tidak\s*Aman|Unsafe\s*A)', "Unsafe Action"),
            (r'(?:Intervensi|Intervention)', "Intervensi"),
        )
        for regex, label in checkbox_captions:
            if re.search(regex, text, re.IGNORECASE):
                extracted["jenis_pengamatan"] = label
                break

    # Pass 3: the LSB number, unless the captured token is the form's own
    # "diisi oleh" placeholder text.
    number_hit = re.search(r"No\.\s*LSB\s*:?\s*([a-zA-Z0-9_\-/\.]+)", text, re.IGNORECASE)
    if number_hit:
        lsb_number = number_hit.group(1).strip()
        if "diisi oleh" not in lsb_number.lower():
            extracted["no_lsb"] = lsb_number

    # Pass 4: "Label: value" lines. Labels are matched case-sensitively and in
    # this order; the first label found on a line claims that line.
    field_mappings = {
        "Tanggal": "tanggal",
        "Lokasi": "lokasi",
        "Nama Pelapor": "nama_pelapor",
        "Unit/Dept": "unit_dept",
        "Jenis Pengamatan": "jenis_pengamatan",
        "Kondisi": "kondisi_bahaya",
        "Tindakan": "tindakan_bahaya",
        "Intervensi": "intervensi",
        "Deskripsi": "deskripsi",
        "Usulan": "usulan_perbaikan",
        "Tindak Lanjut": "tindak_lanjut",
    }
    # NOTE(review): clean_ocr_text appears to leave no newline characters in
    # its output, so this split presumably yields a single element and the
    # continuation branch below never runs. Confirm whether the line-wise
    # pass was meant to operate on the raw text instead.
    active_field = None
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        matched_label = False
        if ":" in line:
            for key, name in field_mappings.items():
                if key not in line:
                    continue
                tail = line.split(":", 1)[1].strip()
                checked = validate_field(name, tail)
                # Values found by the earlier passes take precedence.
                if checked and not extracted.get(name):
                    extracted[name] = checked
                active_field = name
                matched_label = True
                break

        if matched_label or not active_field:
            continue
        # A line without a label continues the most recent field, provided
        # that field already holds a value.
        if (
            active_field in extracted
            and len(line) > 2
            and "diisi oleh" not in line.lower()
        ):
            extracted[active_field] += " " + line

    # Final pass: re-validate everything, since continuation lines may have
    # been appended after the first validation.
    validated = {}
    for name, value in extracted.items():
        checked = validate_field(name, value)
        if checked:
            validated[name] = checked
    return validated
def api_predict(json_input):
    """Run OCR on a base64-encoded image and return the parsed form fields.

    Args:
        json_input: A JSON string or an already-decoded object shaped like
            ``{"data": ["BASE64_IMAGE"]}``. Only the first list entry is used.

    Returns:
        dict: On success, ``{"raw_text": ..., "status": "success"}`` merged
        with the fields from ``parse_lsb_form``. On failure,
        ``{"status": "error", "message": ...}``; unexpected exceptions also
        carry a ``"trace"`` entry. This function never raises.
    """
    try:
        payload = json.loads(json_input) if isinstance(json_input, str) else json_input

        well_formed = (
            isinstance(payload, dict)
            and isinstance(payload.get("data"), list)
            and len(payload["data"]) > 0
        )
        if not well_formed:
            return {"status": "error", "message": "Invalid input format. Expected {\"data\": [\"BASE64_IMAGE\"]}"}

        image = process_base64_image(payload["data"][0])
        if image is None:
            return {"status": "error", "message": "Failed to decode base64 image"}

        if TESSERACT_AVAILABLE:
            # --psm 4: treat the page as a single column of text of variable
            # sizes; --oem 3: default OCR engine mode. lang='ind' selects the
            # Indonesian language data.
            text = pytesseract.image_to_string(image, lang='ind', config='--psm 4 --oem 3')
        else:
            # Without Tesseract the notice itself is parsed and returned as
            # the raw text; the call still reports "success".
            text = "⚠️ OCR tidak dapat diproses karena Tesseract tidak tersedia di Space ini."

        fields = parse_lsb_form(text)
        response = {"raw_text": text, "status": "success"}
        response.update(fields)
        return response
    except Exception as exc:
        import traceback
        # NOTE(review): the full traceback is returned to the caller, which
        # exposes internal paths and code structure. Consider logging it
        # server-side instead if no client depends on the "trace" key.
        return {"status": "error", "message": str(exc), "trace": traceback.format_exc()}
# ASGI application object; uvicorn serves this instance when the module is
# run as a script.
app = FastAPI(
    version="1.0.0",
    title="LSB OCR API",
    description="API for OCR of LSB documents",
)
@app.get("/")
async def read_main():
    """Return a welcome message and the list of available endpoints.

    Registered on ``app`` as ``GET /``; without the route decorator the
    handler is defined but never reachable over HTTP.
    """
    return {
        "message": "Welcome to LSB OCR API",
        "endpoints": {
            "/api/predict": "POST - Analyze LSB images using OCR",
            "/status": "GET - Check API status"
        }
    }
@app.get("/status")
async def status():
    """Report that the service is running and whether Tesseract is available.

    Registered on ``app`` as ``GET /status``, the path advertised by the
    welcome endpoint; without the route decorator the handler is never
    reachable over HTTP.
    """
    return {
        "status": "running",
        "tesseract_available": TESSERACT_AVAILABLE,
        "version": "1.0.0"
    }
@app.post("/api/predict")
async def predict_route(request: Request):
    """Handle ``POST /api/predict``: run OCR on the image in the JSON body.

    Registered on ``app`` under the path advertised by the welcome endpoint;
    without the route decorator the handler is never reachable over HTTP.

    Args:
        request: Incoming request whose JSON body is passed to
            ``api_predict`` unchanged.

    Returns:
        dict: Whatever ``api_predict`` returns, or
        ``{"status": "error", "message": ...}`` if reading the body or the
        prediction call raises.
    """
    try:
        body = await request.json()
        # NOTE(review): api_predict is a synchronous call made inside this
        # coroutine, so a slow OCR run will block the event loop while it
        # executes. Consider offloading it to a worker thread.
        return api_predict(body)
    except Exception as e:
        return {"status": "error", "message": str(e)}
# Script entry point: serve the app with uvicorn when run directly.
if __name__ == "__main__":
    # The port is read from the PORT environment variable; 7860 is the
    # fallback when it is not set.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)