Unlimitedlevel19 committed on
Commit
a9e3e43
·
verified ·
1 Parent(s): 8677d8e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -0
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import base64
4
+ import numpy as np
5
+ from PIL import Image
6
+ from fastapi import FastAPI, Request
7
+ import json
8
+ import uvicorn
9
+ try:
10
+ import pytesseract
11
+ TESSERACT_AVAILABLE = True
12
+ except ImportError:
13
+ TESSERACT_AVAILABLE = False
14
+ print("⚠️ Pytesseract tidak ditemukan. Menggunakan OCR fallback.")
15
+
16
# Turn a base64 string (optionally carrying a data-URI header) into an image
def process_base64_image(base64_string):
    """Decode a base64-encoded image string into a PIL image.

    If the input contains ``data:image``, it is split on commas and only the
    segment following the first comma is decoded. Every failure along the way
    is printed and signalled by returning ``None`` rather than by raising.
    """
    try:
        # Drop the "data:image/...;base64" header when one is present
        payload = (
            base64_string.split(',')[1]
            if 'data:image' in base64_string
            else base64_string
        )
        raw_bytes = base64.b64decode(payload)
        return Image.open(io.BytesIO(raw_bytes))
    except Exception as e:
        print(f"Error processing base64 image: {e}")
        return None
33
+
34
# Parse the OCR text of an LSB form into a structured dictionary
def parse_lsb_form(text):
    """Extract the known LSB form fields from OCR text.

    Each non-empty line is split at its first ``:``. When the label (the part
    before the colon) contains one of the known field keys, the text after
    the colon becomes that field's value. Any other line is treated as a
    continuation and appended to the most recently matched field.

    Args:
        text: OCR output, one form row per line. ``None`` or an empty string
            yields an empty result.

    Returns:
        dict mapping field names (e.g. ``"tanggal"``, ``"lokasi"``) to their
        extracted string values, plus ``"jenis_dokumen"`` when the form title
        is present in the text.
    """
    result = {}
    if not text:
        return result

    if "LAPORAN SUMBER BAHAYA" in text:
        result["jenis_dokumen"] = "LAPORAN SUMBER BAHAYA"

    # Label keyword on the form -> field name in the result
    field_mappings = {
        "Tanggal": "tanggal",
        "Lokasi": "lokasi",
        "Nama Pelapor": "nama_pelapor",
        "Unit/Dept": "unit_dept",
        "Jenis Pengamatan": "jenis_pengamatan",
        "Kondisi": "kondisi_bahaya",
        "Tindakan": "tindakan_bahaya",
        "Intervensi": "intervensi",
        "Deskripsi": "deskripsi",
        "Usulan": "usulan_perbaikan",
        "Tindak Lanjut": "tindak_lanjut",
    }

    current_field = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Only the label may select a field: a keyword that merely appears in
        # the value (e.g. "Deskripsi: Lokasi kerja licin") must not hijack
        # the line for another field.
        label, separator, value = line.partition(":")
        matched_field = None
        if separator:
            matched_field = next(
                (name for key, name in field_mappings.items() if key in label),
                None,
            )

        if matched_field is not None:
            result[matched_field] = value.strip()
            current_field = matched_field
        elif current_field is not None:
            # Continuation line; strip() avoids a leading space when the
            # field's own line had an empty value.
            result[current_field] = f"{result[current_field]} {line}".strip()

    return result
89
+
90
# Predict entry point: takes JSON that carries a base64 image
def api_predict(json_input):
    """Run OCR on the base64 image found in ``json_input`` and parse the text.

    ``json_input`` may be a JSON string or an already-decoded object; the
    expected shape is ``{"data": ["BASE64_IMAGE"]}`` and only the first list
    item is used. The return value is always a dict: on success it holds
    ``raw_text``, ``status`` and the parsed form fields, otherwise ``status``
    is ``"error"`` with a ``message``.

    NOTE(review): unexpected exceptions are returned with a full ``trace``
    string - confirm that exposing tracebacks to callers is intended.
    """
    try:
        data = json.loads(json_input) if isinstance(json_input, str) else json_input

        # The payload must be a dict with a non-empty list under "data"
        has_valid_shape = (
            isinstance(data, dict)
            and "data" in data
            and isinstance(data["data"], list)
            and len(data["data"]) > 0
        )
        if not has_valid_shape:
            return {"status": "error", "message": "Invalid input format. Expected {\"data\": [\"BASE64_IMAGE\"]}"}

        image = process_base64_image(data["data"][0])
        if image is None:
            return {"status": "error", "message": "Failed to decode base64 image"}

        # Without pytesseract a fixed notice stands in for the OCR output
        if TESSERACT_AVAILABLE:
            text = pytesseract.image_to_string(image, lang='ind')
        else:
            text = "⚠️ OCR tidak dapat diproses karena Tesseract tidak tersedia di Space ini."

        fields = parse_lsb_form(text)
        result = {"raw_text": text, "status": "success"}
        result.update(fields)
        return result
    except Exception as e:
        import traceback
        return {"status": "error", "message": str(e), "trace": traceback.format_exc()}
133
+
134
# FastAPI application instance that the route decorators register against
app = FastAPI(title="LSB OCR API", description="API for OCR of LSB documents", version="1.0.0")
140
+
141
@app.get("/")
async def read_main():
    """Return a welcome message and a short description of each endpoint."""
    endpoints = {
        "/api/predict": "POST - Analyze LSB images using OCR",
        "/status": "GET - Check API status",
    }
    return {"message": "Welcome to LSB OCR API", "endpoints": endpoints}
150
+
151
@app.get("/status")
async def status():
    """Report the running state, the pytesseract import flag and the version."""
    return dict(
        status="running",
        tesseract_available=TESSERACT_AVAILABLE,
        version="1.0.0",
    )
158
+
159
@app.post("/api/predict")
async def predict_route(request: Request):
    """Handle POST /api/predict: read the JSON body and run the OCR pipeline.

    Args:
        request: incoming request whose body is parsed as JSON and handed to
            ``api_predict`` unchanged.

    Returns:
        The dict produced by ``api_predict``, or
        ``{"status": "error", "message": ...}`` when reading the body or
        dispatching the work raises.
    """
    # Local import keeps this handler independent of the file-level imports.
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()

        # api_predict is synchronous and performs the OCR call; invoking it
        # directly inside this coroutine would stall the event loop (and thus
        # every other request) until OCR finishes, so run it in a worker
        # thread instead.
        return await run_in_threadpool(api_predict, body)
    except Exception as e:
        return {"status": "error", "message": str(e)}
172
+
173
# Start the server when this file is executed directly (local development)
if __name__ == "__main__":
    # The PORT environment variable wins; 7860 is the fallback
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))