Spaces:

frendyrachman
/

document_analyzer

Sleeping

App Files Files Community

frendyrachman committed on May 20, 2025

Commit

d9b382b

verified ·

1 Parent(s): f14409b

Update app.py

Browse files

Update feature:

1. The downloadable ZIP contains only the valid documents.
2. The downloadable DOCX contains the Gemini analysis result for the document check.
3. Added a list of the valid documents, shown as printed text.

Files changed (1) hide show

app.py +101 -40

app.py CHANGED Viewed

@@ -10,6 +10,23 @@ os.system("apt-get install poppler-utils")
 import datetime
 from docx import Document
 # Function to process a list of PDF files and convert them to images
 def process_pdfs(pdf_files):
     """
@@ -20,7 +37,7 @@ def process_pdfs(pdf_files):
     for pdf_file in pdf_files:
         if not os.path.isfile(pdf_file):
             raise ValueError(f"File {pdf_file} does not exist.")
         images = convert_from_path(pdf_file, dpi=200)  # Convert PDF pages to images
         all_images.extend(images)
@@ -83,7 +100,7 @@ def gemini_analysis(images, tanggal_berangkat, tanggal_pulang):
     11. Slip Gaji
     3 bulan terakhir
     Jika suami lengkap → istri cukup lampirkan rekening koran suami
     ---
     FORMAT JAWABAN UNTUK SETIAP DOKUMEN YANG DIUPLOAD:
     - Jenis Dokumen : (jenis dokumen)
@@ -99,33 +116,77 @@ def gemini_analysis(images, tanggal_berangkat, tanggal_pulang):
     Halo pak/bu (nama peserta), berikut ini rangkuman hasil pemeriksaan dokumen pengajuan visa anda: ...
     '''
-    # Perform content generation using Google Gemini (images passed as files)
-    response = client.models.generate_content(
         model="gemini-1.5-flash",
         contents=[prompt] + images  # Pass prompt and image files
     )
-    return response.text
-def extract_zip_and_collect_files(zip_file_path):
-    """
-    Extract zip file to a temp directory and return list of pdf/image file paths inside.
-    """
     temp_dir = tempfile.mkdtemp()
-    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
-        zip_ref.extractall(temp_dir)
-    # Collect all pdf and image files in extracted folder recursively
-    collected_files = []
-    for root, _, files in os.walk(temp_dir):
-        for f in files:
-            if f.lower().endswith(('.pdf', '.jpg', '.jpeg', '.png')):
-                collected_files.append(os.path.join(root, f))
-    return collected_files
 def main_process(files, tanggal_berangkat, tanggal_pulang):
     all_images = []
     for file in files:
         file_path = file.name if hasattr(file, 'name') else file
@@ -148,35 +209,28 @@ def main_process(files, tanggal_berangkat, tanggal_pulang):
         else:
             raise ValueError(f"File {file_path} is not a valid image, PDF, or ZIP.")
-    summary = gemini_analysis(all_images, tanggal_berangkat, tanggal_pulang)
-    # 📄 Create DOCX with custom filename
     doc = Document()
     doc.add_heading("Visa Document Check Summary", level=1)
     for line in summary.split("\n"):
         doc.add_paragraph(line)
-    # Use first input file name for filename
     first_file = files[0]
     first_filename = os.path.basename(first_file.name if hasattr(first_file, 'name') else first_file)
     base_name = os.path.splitext(first_filename)[0]
     docx_filename = f"summary_{base_name}.docx"
     temp_docx_path = os.path.join(tempfile.gettempdir(), docx_filename)
     doc.save(temp_docx_path)
-    return summary, temp_docx_path
-    # Save to DOCX
-    doc = Document()
-    doc.add_heading("Visa Document Check Summary", level=1)
-    for line in summary.split("\n"):
-        doc.add_paragraph(line)
-    temp_docx_path = os.path.join(tempfile.gettempdir(), f"summary_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.docx")
-    doc.save(temp_docx_path)
-    return summary, temp_docx_path
 # Gradio UI update: add ".zip" to accepted file types
@@ -200,11 +254,18 @@ with gr.Blocks() as demo:
             placeholder="Masukan Tanggal Kepulangan",
             type="text"
         )
     run_btn = gr.Button("🏃 Run Analysis")
-    download_output = gr.File(label="📥 Download Summary as DOCX", visible=True)
-    output = gr.Textbox(label="📝 Summary Result", lines=50)
-    run_btn.click(fn=main_process,
-                  inputs=[file_input, tanggal_berangkat, tanggal_pulang],
-                  outputs=[output, download_output])
-demo.launch()

 import datetime
 from docx import Document
def extract_zip_and_collect_files(zip_file_path):
    """
    Extract a ZIP archive into a new temporary directory and list its documents.

    Args:
        zip_file_path: Path of the ZIP archive to extract.

    Returns:
        List of paths (inside the temporary directory) of every extracted file
        whose name ends in .pdf, .jpg, .jpeg or .png, case-insensitively.
        Directories are visited top-down and names are sorted within each
        directory, so the order is the same on every platform.

    Raises:
        Whatever zipfile.ZipFile raises for a missing or corrupt archive
        (for example zipfile.BadZipFile); nothing is caught here.
    """
    temp_dir = tempfile.mkdtemp()
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    wanted_suffixes = ('.pdf', '.jpg', '.jpeg', '.png')
    collected_files = []
    for root, dirs, files in os.walk(temp_dir):
        # Archives created on macOS carry a "__MACOSX" folder of AppleDouble
        # stubs ("._name.pdf"). They match the suffix filter but are not real
        # documents, so prune the folder and skip the stubs.
        dirs[:] = sorted(d for d in dirs if d != '__MACOSX')
        for f in sorted(files):
            if f.startswith('._'):
                continue
            if f.lower().endswith(wanted_suffixes):
                collected_files.append(os.path.join(root, f))
    return collected_files
 # Function to process a list of PDF files and convert them to images
 def process_pdfs(pdf_files):
     """
     for pdf_file in pdf_files:
         if not os.path.isfile(pdf_file):
             raise ValueError(f"File {pdf_file} does not exist.")
         images = convert_from_path(pdf_file, dpi=200)  # Convert PDF pages to images
         all_images.extend(images)
     11. Slip Gaji
     3 bulan terakhir
     Jika suami lengkap → istri cukup lampirkan rekening koran suami
     ---
     FORMAT JAWABAN UNTUK SETIAP DOKUMEN YANG DIUPLOAD:
     - Jenis Dokumen : (jenis dokumen)
     Halo pak/bu (nama peserta), berikut ini rangkuman hasil pemeriksaan dokumen pengajuan visa anda: ...
     '''
+    # Perform document analysis
+    response_1 = client.models.generate_content(
         model="gemini-1.5-flash",
         contents=[prompt] + images  # Pass prompt and image files
     )
+    analysis = response_1.text
+    # Perform valid list generation
+    prompt_2 = "Buatkan daftar dokumen yang valid saja dalam bentuk list dalam 1 kata di antara contoh berikut: 'PASPOR', 'PAS_FOTO', 'KARTU_KELUARGA', 'BUKTI_NIKAH', 'KTP', 'SURAT_KELAHIRAN', 'SURAT_SPONSOR', 'SURAT_KERJA', 'NIB', 'SIUP', 'SURAT_JAMINAN_STAF', 'SURAT_SEKOLAH', 'SURAT_PENDIDIKAN', 'KONTRAK_KERJA', 'SURAT_PENSIUN', 'REKENING_KORAN', 'SLIP_GAJI'"
+    response_2 = client.models.generate_content(
+        model="gemini-1.5-flash",
+        contents=[response_1.text, prompt_2]  # Pass prompt and image files
+    )
+    valid_docs_list_raw = response_2.text.strip()
+    if '\n' in valid_docs_list_raw:
+      docs_list = [line.strip("-• ").strip() for line in valid_docs_list_raw.splitlines() if line.strip()]
+    elif ',' in valid_docs_list_raw:
+      docs_list = [item.strip() for item in valid_docs_list_raw.split(",") if item.strip()]
+    else:
+      # fallback: jadikan 1 elemen list
+      docs_list = [valid_docs_list_raw]
+    return analysis, docs_list
def image_file_naming(images):
    """
    Ask Gemini to classify each image and derive a file name from its answer.

    Args:
        images: Iterable of image objects; each one is sent to the model
            together with the classification prompt.

    Returns:
        List of dicts, one per input image and in the same order, of the form
        {"image": <the input image>, "filename": "<LABEL>.jpg"}.

    The label is the model's answer, upper-cased and stripped of surrounding
    whitespace, quotes, backticks, asterisks and periods. If the cleaned answer
    is not one of the known labels but contains exactly one of them, that label
    is used. Otherwise the cleaned answer is kept as-is, and an empty or missing
    answer becomes "UNKNOWN".
    """
    # Same labels as listed in the prompt below; used to normalise the answer.
    known_labels = (
        'PASPOR', 'PAS_FOTO', 'KARTU_KELUARGA', 'BUKTI_NIKAH', 'KTP',
        'SURAT_KELAHIRAN', 'SURAT_SPONSOR', 'SURAT_KERJA', 'NIB', 'SIUP',
        'SURAT_JAMINAN_STAF', 'SURAT_SEKOLAH', 'SURAT_PENDIDIKAN',
        'KONTRAK_KERJA', 'SURAT_PENSIUN', 'REKENING_KORAN', 'SLIP_GAJI',
    )
    # The prompt does not depend on the image, so build it once.
    prompt_3 = "Analisa dokumen berikut dan beri nama dalam 1 nama dokumen pasti antara: 'PASPOR', 'PAS_FOTO', 'KARTU_KELUARGA', 'BUKTI_NIKAH', 'KTP', 'SURAT_KELAHIRAN', 'SURAT_SPONSOR', 'SURAT_KERJA', 'NIB', 'SIUP', 'SURAT_JAMINAN_STAF', 'SURAT_SEKOLAH', 'SURAT_PENDIDIKAN', 'KONTRAK_KERJA', 'SURAT_PENSIUN', 'REKENING_KORAN', 'SLIP_GAJI'"
    client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

    renamed_images = []
    for image in images:
        response_3 = client.models.generate_content(
            model="gemini-1.5-flash",
            contents=[prompt_3, image]  # Prompt followed by the image to classify
        )
        # response.text can be None when the model returns no text part.
        answer = (response_3.text or "").upper().strip().strip("'\"`*. \t\r\n")
        if answer not in known_labels:
            # Accept a label embedded in a longer sentence only when it is
            # unambiguous; never guess between several candidates.
            embedded = [label for label in known_labels if label in answer]
            if len(embedded) == 1:
                answer = embedded[0]
        new_image_filename = (answer or "UNKNOWN") + ".jpg"
        renamed_images.append({"image": image, "filename": new_image_filename})
    return renamed_images
def save_images_to_zip(images_info, zip_name="filtered_images.zip"):
    """
    Bundle images into a ZIP archive containing one PDF per document name.

    Args:
        images_info: Iterable of dicts with keys "image" (an object offering
            PIL-style ``convert`` and ``save``) and "filename". Entries whose
            file names share the same stem become pages of the same PDF, in
            input order.
        zip_name: File name given to the archive.

    Returns:
        Path of the written ZIP file. Each call writes into its own new
        temporary directory, so simultaneous calls cannot overwrite each
        other's archive. An empty ``images_info`` yields an empty archive.
    """
    # Step 1: group the images by document name (file name without extension).
    grouped = {}
    for item in images_info:
        name = _safe_doc_name(os.path.splitext(item["filename"])[0])
        grouped.setdefault(name, []).append(item["image"])

    zip_path = os.path.join(tempfile.mkdtemp(), zip_name)

    # Step 2: save each group as a PDF and add it to the archive. The
    # intermediate PDFs live in a scratch directory that is removed afterwards.
    with tempfile.TemporaryDirectory() as work_dir, \
            zipfile.ZipFile(zip_path, 'w') as zipf:
        for doc_name, images in grouped.items():
            # PDF output needs every page in RGB mode.
            images_rgb = [img.convert("RGB") for img in images]
            pdf_path = os.path.join(work_dir, f"{doc_name}.pdf")
            # An empty append_images list gives a single-page PDF.
            images_rgb[0].save(pdf_path, save_all=True, append_images=images_rgb[1:])
            zipf.write(pdf_path, arcname=f"{doc_name}.pdf")
    return zip_path


def _safe_doc_name(name):
    """Replace path separators and non-printable characters in name with '_'."""
    cleaned = "".join(
        "_" if (ch in "/\\" or not ch.isprintable()) else ch for ch in name
    ).strip()
    return cleaned or "document"
 def main_process(files, tanggal_berangkat, tanggal_pulang):
     all_images = []
+    image_paths_for_zip = []
     for file in files:
         file_path = file.name if hasattr(file, 'name') else file
         else:
             raise ValueError(f"File {file_path} is not a valid image, PDF, or ZIP.")
+    # Generate summary from images
+    summary, valid_list = gemini_analysis(all_images, tanggal_berangkat, tanggal_pulang)
+    # Create DOCX for summary output
     doc = Document()
     doc.add_heading("Visa Document Check Summary", level=1)
     for line in summary.split("\n"):
         doc.add_paragraph(line)
     first_file = files[0]
     first_filename = os.path.basename(first_file.name if hasattr(first_file, 'name') else first_file)
     base_name = os.path.splitext(first_filename)[0]
     docx_filename = f"summary_{base_name}.docx"
     temp_docx_path = os.path.join(tempfile.gettempdir(), docx_filename)
     doc.save(temp_docx_path)
+    # Filtering the file
+    renamed_images_info = image_file_naming(all_images)
+    # Filter hanya yang valid
+    images_to_zip = [img_info for img_info in renamed_images_info if os.path.splitext(img_info["filename"])[0] in valid_list]
+    zip_file_path = save_images_to_zip(images_to_zip)
+    return temp_docx_path, valid_list, zip_file_path
 # Gradio UI update: add ".zip" to accepted file types
             placeholder="Masukan Tanggal Kepulangan",
             type="text"
         )
     run_btn = gr.Button("🏃 Run Analysis")
+    output = gr.Textbox(label="📝 Valid Document List", lines=5)
+    with gr.Row():
+        download_output_docx = gr.File(label="📥 Download Summary as DOCX", visible=True)
+        download_valid_zip = gr.File(label="📥 Download Valid Only document in zip", visible=True)
+    run_btn.click(
+        fn=main_process,
+        inputs=[file_input, tanggal_berangkat, tanggal_pulang],
+        outputs=[download_output_docx, output, download_valid_zip]
+    )
+demo.launch(debug=True)