frendyrachman committed on
Commit
d9b382b
·
verified ·
1 Parent(s): f14409b

Update app.py

Browse files

Feature updates:

1. The downloadable ZIP now contains only the valid documents.
2. The downloadable DOCX contains the Gemini analysis result of the document check.
3. Added a list of the valid documents, shown as printed text.

Files changed (1) hide show
  1. app.py +101 -40
app.py CHANGED
@@ -10,6 +10,23 @@ os.system("apt-get install poppler-utils")
10
  import datetime
11
  from docx import Document
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Function to process a list of PDF files and convert them to images
14
  def process_pdfs(pdf_files):
15
  """
@@ -20,7 +37,7 @@ def process_pdfs(pdf_files):
20
  for pdf_file in pdf_files:
21
  if not os.path.isfile(pdf_file):
22
  raise ValueError(f"File {pdf_file} does not exist.")
23
-
24
  images = convert_from_path(pdf_file, dpi=200) # Convert PDF pages to images
25
  all_images.extend(images)
26
 
@@ -83,7 +100,7 @@ def gemini_analysis(images, tanggal_berangkat, tanggal_pulang):
83
  11. Slip Gaji
84
  3 bulan terakhir
85
  Jika suami lengkap → istri cukup lampirkan rekening koran suami
86
-
87
  ---
88
  FORMAT JAWABAN UNTUK SETIAP DOKUMEN YANG DIUPLOAD:
89
  - Jenis Dokumen : (jenis dokumen)
@@ -99,33 +116,77 @@ def gemini_analysis(images, tanggal_berangkat, tanggal_pulang):
99
  Halo pak/bu (nama peserta), berikut ini rangkuman hasil pemeriksaan dokumen pengajuan visa anda: ...
100
  '''
101
 
102
- # Perform content generation using Google Gemini (images passed as files)
103
- response = client.models.generate_content(
104
  model="gemini-1.5-flash",
105
  contents=[prompt] + images # Pass prompt and image files
106
  )
107
- return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- def extract_zip_and_collect_files(zip_file_path):
111
- """
112
- Extract zip file to a temp directory and return list of pdf/image file paths inside.
113
- """
114
  temp_dir = tempfile.mkdtemp()
115
- with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
116
- zip_ref.extractall(temp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- # Collect all pdf and image files in extracted folder recursively
119
- collected_files = []
120
- for root, _, files in os.walk(temp_dir):
121
- for f in files:
122
- if f.lower().endswith(('.pdf', '.jpg', '.jpeg', '.png')):
123
- collected_files.append(os.path.join(root, f))
124
- return collected_files
125
 
126
 
127
  def main_process(files, tanggal_berangkat, tanggal_pulang):
128
  all_images = []
 
129
 
130
  for file in files:
131
  file_path = file.name if hasattr(file, 'name') else file
@@ -148,35 +209,28 @@ def main_process(files, tanggal_berangkat, tanggal_pulang):
148
  else:
149
  raise ValueError(f"File {file_path} is not a valid image, PDF, or ZIP.")
150
 
151
- summary = gemini_analysis(all_images, tanggal_berangkat, tanggal_pulang)
 
152
 
153
- # 📄 Create DOCX with custom filename
154
  doc = Document()
155
  doc.add_heading("Visa Document Check Summary", level=1)
156
  for line in summary.split("\n"):
157
  doc.add_paragraph(line)
158
-
159
- # Use first input file name for filename
160
  first_file = files[0]
161
  first_filename = os.path.basename(first_file.name if hasattr(first_file, 'name') else first_file)
162
  base_name = os.path.splitext(first_filename)[0]
163
  docx_filename = f"summary_{base_name}.docx"
164
  temp_docx_path = os.path.join(tempfile.gettempdir(), docx_filename)
165
-
166
  doc.save(temp_docx_path)
167
 
168
- return summary, temp_docx_path
169
-
170
- # Save to DOCX
171
- doc = Document()
172
- doc.add_heading("Visa Document Check Summary", level=1)
173
- for line in summary.split("\n"):
174
- doc.add_paragraph(line)
175
-
176
- temp_docx_path = os.path.join(tempfile.gettempdir(), f"summary_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.docx")
177
- doc.save(temp_docx_path)
178
 
179
- return summary, temp_docx_path
180
 
181
 
182
  # Gradio UI update: add ".zip" to accepted file types
@@ -200,11 +254,18 @@ with gr.Blocks() as demo:
200
  placeholder="Masukan Tanggal Kepulangan",
201
  type="text"
202
  )
 
203
  run_btn = gr.Button("πŸƒ Run Analysis")
204
- download_output = gr.File(label="πŸ“₯ Download Summary as DOCX", visible=True)
205
- output = gr.Textbox(label="πŸ“ Summary Result", lines=50)
 
 
 
 
 
 
 
 
 
206
 
207
- run_btn.click(fn=main_process,
208
- inputs=[file_input, tanggal_berangkat, tanggal_pulang],
209
- outputs=[output, download_output])
210
- demo.launch()
 
10
  import datetime
11
  from docx import Document
12
 
13
+
14
+ def extract_zip_and_collect_files(zip_file_path):
15
+ """
16
+ Extract zip file to a temp directory and return list of pdf/image file paths inside.
17
+ """
18
+ temp_dir = tempfile.mkdtemp()
19
+ with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
20
+ zip_ref.extractall(temp_dir)
21
+
22
+ # Collect all pdf and image files in extracted folder recursively
23
+ collected_files = []
24
+ for root, _, files in os.walk(temp_dir):
25
+ for f in files:
26
+ if f.lower().endswith(('.pdf', '.jpg', '.jpeg', '.png')):
27
+ collected_files.append(os.path.join(root, f))
28
+ return collected_files
29
+
30
  # Function to process a list of PDF files and convert them to images
31
  def process_pdfs(pdf_files):
32
  """
 
37
  for pdf_file in pdf_files:
38
  if not os.path.isfile(pdf_file):
39
  raise ValueError(f"File {pdf_file} does not exist.")
40
+
41
  images = convert_from_path(pdf_file, dpi=200) # Convert PDF pages to images
42
  all_images.extend(images)
43
 
 
100
  11. Slip Gaji
101
  3 bulan terakhir
102
  Jika suami lengkap → istri cukup lampirkan rekening koran suami
103
+
104
  ---
105
  FORMAT JAWABAN UNTUK SETIAP DOKUMEN YANG DIUPLOAD:
106
  - Jenis Dokumen : (jenis dokumen)
 
116
  Halo pak/bu (nama peserta), berikut ini rangkuman hasil pemeriksaan dokumen pengajuan visa anda: ...
117
  '''
118
 
119
+ # Perform document analysis
120
+ response_1 = client.models.generate_content(
121
  model="gemini-1.5-flash",
122
  contents=[prompt] + images # Pass prompt and image files
123
  )
124
+ analysis = response_1.text
125
+ # Perform valid list generation
126
+ prompt_2 = "Buatkan daftar dokumen yang valid saja dalam bentuk list dalam 1 kata di antara contoh berikut: 'PASPOR', 'PAS_FOTO', 'KARTU_KELUARGA', 'BUKTI_NIKAH', 'KTP', 'SURAT_KELAHIRAN', 'SURAT_SPONSOR', 'SURAT_KERJA', 'NIB', 'SIUP', 'SURAT_JAMINAN_STAF', 'SURAT_SEKOLAH', 'SURAT_PENDIDIKAN', 'KONTRAK_KERJA', 'SURAT_PENSIUN', 'REKENING_KORAN', 'SLIP_GAJI'"
127
+ response_2 = client.models.generate_content(
128
+ model="gemini-1.5-flash",
129
+ contents=[response_1.text, prompt_2] # Pass prompt and image files
130
+ )
131
+ valid_docs_list_raw = response_2.text.strip()
132
+ if '\n' in valid_docs_list_raw:
133
+ docs_list = [line.strip("-β€’ ").strip() for line in valid_docs_list_raw.splitlines() if line.strip()]
134
+ elif ',' in valid_docs_list_raw:
135
+ docs_list = [item.strip() for item in valid_docs_list_raw.split(",") if item.strip()]
136
+ else:
137
+ # fallback: jadikan 1 elemen list
138
+ docs_list = [valid_docs_list_raw]
139
+ return analysis, docs_list
140
 
141
+ # Function that asks Gemini to name each image after the document type it detects
142
+ def image_file_naming(images):
143
+ renamed_images = []
144
+ client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
145
+
146
+ for i, image in enumerate(images):
147
+ prompt_3 = "Analisa dokumen berikut dan beri nama dalam 1 nama dokumen pasti antara: 'PASPOR', 'PAS_FOTO', 'KARTU_KELUARGA', 'BUKTI_NIKAH', 'KTP', 'SURAT_KELAHIRAN', 'SURAT_SPONSOR', 'SURAT_KERJA', 'NIB', 'SIUP', 'SURAT_JAMINAN_STAF', 'SURAT_SEKOLAH', 'SURAT_PENDIDIKAN', 'KONTRAK_KERJA', 'SURAT_PENSIUN', 'REKENING_KORAN', 'SLIP_GAJI'"
148
+ response_3 = client.models.generate_content(
149
+ model="gemini-1.5-flash",
150
+ contents=[prompt_3, image] # Pass prompt and image files
151
+ )
152
+ new_image_filename = response_3.text.strip().upper() + ".jpg"
153
+ renamed_images.append({"image": image, "filename": new_image_filename})
154
+
155
+ return renamed_images
156
 
157
+ def save_images_to_zip(images_info, zip_name="filtered_images.zip"):
158
+ from PIL import Image
 
 
159
  temp_dir = tempfile.mkdtemp()
160
+ zip_path = os.path.join(tempfile.gettempdir(), zip_name)
161
+
162
+ # Step 1: Kelompokkan gambar berdasarkan nama dokumen
163
+ grouped = {}
164
+ for item in images_info:
165
+ name = os.path.splitext(item["filename"])[0] # tanpa .jpg
166
+ grouped.setdefault(name, []).append(item["image"])
167
+
168
+ # Step 2: Simpan tiap grup sebagai PDF
169
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
170
+ for doc_name, images in grouped.items():
171
+ # pastikan semua dalam RGB
172
+ images_rgb = [img.convert("RGB") for img in images]
173
+ pdf_path = os.path.join(temp_dir, f"{doc_name}.pdf")
174
+
175
+ if len(images_rgb) == 1:
176
+ images_rgb[0].save(pdf_path, save_all=True)
177
+ else:
178
+ images_rgb[0].save(pdf_path, save_all=True, append_images=images_rgb[1:])
179
+
180
+ zipf.write(pdf_path, arcname=f"{doc_name}.pdf")
181
+
182
+ return zip_path
183
+
184
 
 
 
 
 
 
 
 
185
 
186
 
187
  def main_process(files, tanggal_berangkat, tanggal_pulang):
188
  all_images = []
189
+ image_paths_for_zip = []
190
 
191
  for file in files:
192
  file_path = file.name if hasattr(file, 'name') else file
 
209
  else:
210
  raise ValueError(f"File {file_path} is not a valid image, PDF, or ZIP.")
211
 
212
+ # Generate summary from images
213
+ summary, valid_list = gemini_analysis(all_images, tanggal_berangkat, tanggal_pulang)
214
 
215
+ # Create DOCX for summary output
216
  doc = Document()
217
  doc.add_heading("Visa Document Check Summary", level=1)
218
  for line in summary.split("\n"):
219
  doc.add_paragraph(line)
 
 
220
  first_file = files[0]
221
  first_filename = os.path.basename(first_file.name if hasattr(first_file, 'name') else first_file)
222
  base_name = os.path.splitext(first_filename)[0]
223
  docx_filename = f"summary_{base_name}.docx"
224
  temp_docx_path = os.path.join(tempfile.gettempdir(), docx_filename)
 
225
  doc.save(temp_docx_path)
226
 
227
+ # Filtering the file
228
+ renamed_images_info = image_file_naming(all_images)
229
+ # Filter hanya yang valid
230
+ images_to_zip = [img_info for img_info in renamed_images_info if os.path.splitext(img_info["filename"])[0] in valid_list]
231
+ zip_file_path = save_images_to_zip(images_to_zip)
 
 
 
 
 
232
 
233
+ return temp_docx_path, valid_list, zip_file_path
234
 
235
 
236
  # Gradio UI update: add ".zip" to accepted file types
 
254
  placeholder="Masukan Tanggal Kepulangan",
255
  type="text"
256
  )
257
+
258
  run_btn = gr.Button("πŸƒ Run Analysis")
259
+ output = gr.Textbox(label="πŸ“ Valid Document List", lines=5)
260
+
261
+ with gr.Row():
262
+ download_output_docx = gr.File(label="πŸ“₯ Download Summary as DOCX", visible=True)
263
+ download_valid_zip = gr.File(label="πŸ“₯ Download Valid Only document in zip", visible=True)
264
+
265
+ run_btn.click(
266
+ fn=main_process,
267
+ inputs=[file_input, tanggal_berangkat, tanggal_pulang],
268
+ outputs=[download_output_docx, output, download_valid_zip]
269
+ )
270
 
271
+ demo.launch(debug=True)