Spaces:

protae5544
/

pdfk

Sleeping

File size: 31,504 Bytes

7c5d3b3
 
 
 
31f4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c5d3b3
 
9e2500c
 
057c155
9e2500c
 
 
 
31f4117
 
c5d7fde
 
9e2500c
c5d7fde
9e2500c
 
 
 
 
 
c5d7fde
 
 
9e2500c
 
 
 
 
c5d7fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2500c
c5d7fde
9e2500c
 
 
 
 
 
 
c5d7fde
9e2500c
 
 
 
057c155
9e2500c
 
057c155
9e2500c
 
c5d7fde
9e2500c
 
c5d7fde
 
 
 
 
 
 
 
 
 
 
 
 
9e2500c
 
 
 
057c155
 
 
 
c5d7fde
 
057c155
 
c5d7fde
 
057c155
 
 
 
 
 
 
 
c5d7fde
057c155
 
 
 
 
c5d7fde
 
057c155
 
c5d7fde
057c155
 
 
c5d7fde
 
057c155
c5d7fde
057c155
c5d7fde
057c155
c5d7fde
057c155
c5d7fde
057c155
c5d7fde
 
057c155
c5d7fde
 
057c155
 
c5d7fde
057c155
c5d7fde
057c155
 
c5d7fde
057c155
 
c5d7fde
057c155
 
c5d7fde
057c155
 
 
c5d7fde
 
057c155
c5d7fde
057c155
 
 
c5d7fde
 
 
 
 
057c155
 
c5d7fde
 
057c155
c5d7fde
057c155
 
c5d7fde
057c155
9e2500c
c5d7fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2500c
 
c5d7fde
9e2500c
c5d7fde
 
 
 
 
 
 
9e2500c
 
 
c5d7fde
 
9e2500c
 
c5d7fde
 
9e2500c
 
c5d7fde
9e2500c
 
 
c5d7fde
9e2500c
c5d7fde
9e2500c
 
c5d7fde
9e2500c
 
 
 
 
 
 
c5d7fde
 
 
9e2500c
c5d7fde
 
 
9e2500c
 
 
c5d7fde
 
9e2500c
 
 
c5d7fde
9e2500c
 
 
c5d7fde
 
 
9e2500c
 
 
 
c5d7fde
 
 
 
9e2500c
 
c5d7fde
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2500c
 
c5d7fde
 
 
 
9e2500c
c5d7fde
 
 
9e2500c
c5d7fde
 
9e2500c
c5d7fde
9e2500c
c5d7fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2500c
c5d7fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2500c
c5d7fde
 
 
 
 
 
 
 
 
9e2500c
c5d7fde
9e2500c
c5d7fde
 
 
 
 
 
9e2500c
 
c5d7fde
9e2500c
057c155
9e2500c
c5d7fde
 
 
 
 
 
 
 
9e2500c
 
 
c5d7fde
9e2500c
 
c5d7fde
9e2500c
 
c5d7fde
9e2500c
c5d7fde
9e2500c
 
 
c5d7fde
9e2500c
c5d7fde
 
 
 
9e2500c
 
 
c5d7fde
 
9e2500c
c5d7fde
 
 
 
 
 
9e2500c
 
057c155
c5d7fde
 
 
057c155
c5d7fde
 
 
057c155
 
c5d7fde
 
 
057c155
c5d7fde
 
 
 
 
 
 
057c155
 
c5d7fde
 
 
057c155
c5d7fde
 
 
 
057c155
c5d7fde
057c155
c5d7fde

# -*- coding: utf-8 -*-

# ==============================================================================
# PDF Form Filler & Template Generator with AI Enhancements
#
# คำอธิบาย:
#   - เครื่องมือนี้สร้างขึ้นด้วย Gradio สำหรับจัดการไฟล์ PDF และ CSV
#   - ความสามารถหลัก:
#     1. สร้าง Template (CSV/JSON) จากฟอร์ม PDF ที่มีอยู่
#     2. เติมข้อมูลจากไฟล์ CSV ลงในฟอร์ม PDF ทีละหลายๆ ไฟล์
#     3. หาก PDF ไม่มีฟอร์ม จะสร้าง PDF ใหม่จากข้อมูลในแต่ละแถวของ CSV
#     4. (ทางเลือก) ใช้ AI และ OCR เพื่อแปลงข้อมูลจากรูปภาพเป็น CSV
#     5. (ทางเลือก) ใช้ AI ช่วยแนะนำการจับคู่คอลัมน์ CSV กับช่องใน PDF
#     6. (ทางเลือก) ใช้ AI ช่วยตรวจสอบและทำความสะอาดข้อมูลก่อนสร้าง PDF
#
# การติดตั้ง Dependencies:
#   - pip install gradio pandas PyPDF2 reportlab
#   - สำหรับฟีเจอร์ AI/OCR (ทางเลือก):
#     - pip install Pillow numpy opencv-python pytesseract
#     - ต้องติดตั้ง Tesseract OCR Engine ในระบบของคุณและตั้งค่า PATH ให้ถูกต้อง
#     - https://github.com/tesseract-ocr/tesseract
#   - สำหรับฟีเจอร์ SambaNova AI (ทางเลือก):
#     - pip install 'gradio_client>=0.12.0'
#
# ==============================================================================

import gradio as gr
import pandas as pd
import json
import io
import zipfile
from datetime import datetime
import traceback
import tempfile
import os
import sys
import subprocess

# --- Verify the core dependencies, installing them on first run if missing ---
# Both branches must bind the same names: code further down the file uses
# PdfReader, PdfWriter, canvas, letter, pdfmetrics and TTFont.
try:
    from PyPDF2 import PdfReader, PdfWriter
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    print("Dependencies หลักถูกติดตั้งเรียบร้อยแล้ว")
except ImportError:
    print("กำลังติดตั้ง dependencies ที่จำเป็น: PyPDF2, reportlab, pandas")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2", "reportlab", "pandas"])
    from PyPDF2 import PdfReader, PdfWriter
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    # These two were missing from the fallback path, which left
    # create_simple_pdf unable to register its Thai font after an auto-install.
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
# --- Optional AI/OCR dependencies (a warning is printed when any is absent) ---
# AI_OCR_ENABLED records whether every image/OCR import succeeded.
try:
    from PIL import Image
    import numpy as np
    import cv2
    import pytesseract
except ImportError:
    AI_OCR_ENABLED = False
    print(
        "คำเตือน: ไม่พบ Dependencies สำหรับ AI/OCR (Pillow, numpy, opencv-python, pytesseract)",
        "ฟังก์ชันที่เกี่ยวกับรูปภาพและ OCR จะไม่สามารถใช้งานได้",
        "ติดตั้งด้วย: pip install Pillow numpy opencv-python pytesseract และติดตั้ง Tesseract engine",
        sep="\n",
    )
else:
    AI_OCR_ENABLED = True
    print("Dependencies สำหรับ AI/OCR พร้อมใช้งาน")

# SAMBANOVA_AI_ENABLED records whether the optional gradio_client import worked.
try:
    from gradio_client import Client
except ImportError:
    SAMBANOVA_AI_ENABLED = False
    print(
        "คำเตือน: ไม่พบ Gradio Client (pip install 'gradio_client>=0.12.0')",
        "ฟังก์ชันที่ต้องใช้ AI Model จะไม่สามารถใช้งานได้",
        sep="\n",
    )
else:
    SAMBANOVA_AI_ENABLED = True
    print("Dependencies สำหรับ SambaNova AI พร้อมใช้งาน")


# ==============================================================================
# ส่วนของฟังก์ชันหลัก (Core Functions)
# ==============================================================================

def analyze_pdf_fields(pdf_path):
    """Collect the form fields of a PDF into a dictionary keyed by field name.

    Two passes are made: first over the ``/Fields`` array of the document's
    ``/AcroForm`` (when present), then over the ``/Widget`` annotations of
    every page. An annotation is recorded only if its name has not been seen.

    Args:
        pdf_path: Value handed to ``PdfReader``.

    Returns:
        dict: ``{name: {'type', 'default_value', 'method'}}`` (annotation
        entries also carry a 1-based ``'page'``), or ``{"error": message}``
        if anything raises while reading the document.
    """
    def _text(raw):
        # String form of a PDF object with surrounding parentheses removed.
        return str(raw).strip("()")

    try:
        reader = PdfReader(pdf_path)
        found = {}

        # Pass 1: fields listed in the document-level AcroForm.
        if reader.trailer.get("/Root") and reader.trailer["/Root"].get("/AcroForm"):
            form = reader.trailer["/Root"]["/AcroForm"]
            if "/Fields" in form:
                for ref in form["/Fields"]:
                    obj = ref.get_object()
                    if "/T" not in obj:
                        continue
                    name = _text(obj["/T"])
                    found[name] = dict(
                        type=str(obj.get("/FT", "Unknown")),
                        default_value=_text(obj.get("/V", "")),
                        method='AcroForm',
                    )

        # Pass 2: widget annotations on each page, skipping known names.
        for page_index, page in enumerate(reader.pages):
            if "/Annots" not in page:
                continue
            for ref in page["/Annots"]:
                obj = ref.get_object()
                is_widget = obj.get("/Subtype") == "/Widget"
                if not (is_widget and "/T" in obj):
                    continue
                name = _text(obj["/T"])
                if name in found:
                    continue
                found[name] = dict(
                    type=str(obj.get("/FT", "Widget")),
                    default_value=_text(obj.get("/V", "")),
                    page=page_index + 1,
                    method='Annotation',
                )
        return found
    except Exception as e:
        return {"error": str(e)}

def generate_csv_template(pdf_fields, num_rows=5):
    """Build a sample CSV template (as a DataFrame) from analysed PDF fields.

    The frame has an ``id`` column numbered from 1 and one column per
    non-blank field name, pre-filled with placeholder text that ends in the
    row number.

    Args:
        pdf_fields: Mapping of field name to field info. An empty mapping or
            one containing an ``"error"`` key yields no template.
        num_rows: Number of sample rows; coerced with ``int()`` so a float
            count is accepted.

    Returns:
        tuple: ``(DataFrame, message)`` on success, ``(None, message)`` when
        no template can be built.
    """
    if not pdf_fields or "error" in pdf_fields:
        return None, "ไม่สามารถสร้าง CSV template ได้"

    row_numbers = list(range(1, int(num_rows) + 1))
    template_data = {'id': row_numbers}
    for field_name in pdf_fields:
        clean_name = field_name.strip() if field_name else ""
        if not clean_name:
            continue
        # Interpolate directly: the previous str.format() template broke on
        # field names that themselves contain '{' or '}'.
        template_data[clean_name] = [
            f"ข้อมูลสำหรับ {clean_name} {i}" for i in row_numbers
        ]
    return pd.DataFrame(template_data), "สร้าง CSV template สำเร็จ"

def generate_json_template(pdf_fields):
    """Build a JSON-serialisable template describing the PDF's fields.

    Args:
        pdf_fields: Mapping of field name to field info. An empty mapping or
            one containing an ``"error"`` key yields no template.

    Returns:
        tuple: ``(template, message)`` where ``template`` holds ``pdf_info``
        (field count and generation time), ``fields`` (non-blank names,
        stripped) and three ``sample_data`` records; ``(None, message)`` when
        no template can be built.
    """
    if not pdf_fields or "error" in pdf_fields:
        return None, "ไม่สามารถสร้าง JSON template ได้"

    generated_at = datetime.now().isoformat()

    # Keep only fields whose name is non-blank, keyed by the stripped name.
    named_fields = {
        name.strip(): info
        for name, info in pdf_fields.items()
        if name and name.strip()
    }

    # Three sample records; every field gets the same placeholder per record.
    samples = [
        {"id": n, **{name: f"ข้อมูลตัวอย่าง {n}" for name in named_fields}}
        for n in (1, 2, 3)
    ]

    template = {
        "pdf_info": {"total_fields": len(pdf_fields), "generation_time": generated_at},
        "fields": named_fields,
        "sample_data": samples,
    }
    return template, "สร้าง JSON template สำเร็จ"

def create_template_files(pdf_file, num_rows, progress=gr.Progress()):
    """Create the template bundle (CSV, JSON, README) for a PDF as a ZIP file.

    Args:
        pdf_file: Uploaded PDF; its path is read from ``pdf_file.name``.
            ``None`` produces a "please upload" message.
        num_rows: Number of sample rows for the CSV template (coerced to int).
        progress: Gradio progress reporter.

    Returns:
        tuple: ``(zip_path, message)`` on success, ``(None, message)`` on any
        failure. Exceptions are caught and reported in the message together
        with the traceback.
    """
    if pdf_file is None:
        return None, "❌ กรุณาอัพโหลดไฟล์ PDF ก่อน"

    progress(0, desc="กำลังวิเคราะห์ PDF...")
    try:
        # Normalise the row count once so range() and the message agree.
        num_rows = int(num_rows)

        pdf_fields = analyze_pdf_fields(pdf_file.name)
        if not pdf_fields or "error" in pdf_fields:
            return None, "❌ ไม่พบ Form Fields ใน PDF หรือไฟล์เสียหาย"

        progress(0.3, desc="กำลังสร้าง CSV template...")
        csv_df, _ = generate_csv_template(pdf_fields, num_rows)

        progress(0.6, desc="กำลังสร้าง JSON template...")
        json_template, _ = generate_json_template(pdf_fields)

        if csv_df is None or json_template is None:
            return None, "❌ ไม่สามารถสร้างไฟล์ template ได้"

        progress(0.8, desc="กำลังบีบอัดไฟล์เป็น ZIP...")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        field_lines = "\n".join(f"- {name}" for name in pdf_fields)
        readme_content = (
            "# PDF Form Template Files\n"
            f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            f"PDF Fields Found: {len(pdf_fields)}\n"
            f"{field_lines}\n"
        )

        # Encode explicitly: to_csv() ignores `encoding` when it writes to a
        # text buffer, so the BOM requested by 'utf-8-sig' was never emitted.
        csv_bytes = csv_df.to_csv(index=False).encode('utf-8-sig')
        json_str = json.dumps(json_template, ensure_ascii=False, indent=2)

        # A private directory per call: a second-resolution timestamp alone
        # lets two simultaneous requests overwrite each other's archive.
        out_dir = tempfile.mkdtemp(prefix="pdf_templates_")
        temp_zip_path = os.path.join(out_dir, f"pdf_templates_{timestamp}.zip")

        # Write the archive straight to disk instead of via a memory buffer.
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_f:
            zip_f.writestr(f"template_{timestamp}.csv", csv_bytes)
            zip_f.writestr(f"template_{timestamp}.json", json_str)
            zip_f.writestr("README.txt", readme_content)

        progress(1, desc="สร้างไฟล์สำเร็จ!")
        result_msg = f"✅ สร้าง template สำเร็จ!\n- พบ {len(pdf_fields)} fields\n- CSV มี {num_rows} แถวตัวอย่าง"

        return temp_zip_path, result_msg

    except Exception as e:
        return None, f"❌ เกิดข้อผิดพลาด: {e}\n{traceback.format_exc()}"

def fill_pdf_form(pdf_path, field_data):
    """Fill the form fields of a PDF and return the resulting document bytes.

    Args:
        pdf_path: Value handed to ``PdfReader`` for the source form.
        field_data: Mapping of field name to value (e.g. one CSV row as a
            dict). Missing values (``None``/NaN) are written as empty strings
            and every other value is converted with ``str()``.

    Returns:
        bytes: The filled PDF.

    Raises:
        Exception: Whatever ``PdfReader``/``PdfWriter`` raise while opening or
            writing the document; per-page fill errors are skipped.
    """
    def _as_text(value):
        # Empty CSV cells arrive as NaN; do not print "nan" into the form.
        if value is None:
            return ""
        try:
            if pd.isna(value):
                return ""
        except (TypeError, ValueError):
            # Non-scalar value: pd.isna() has no single truth value for it.
            pass
        return str(value)

    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    writer.append_pages_from_reader(reader)

    values = {key: _as_text(value) for key, value in field_data.items()}

    for page in writer.pages:
        try:
            try:
                writer.update_page_form_field_values(page, values, auto_regenerate=False)
            except TypeError:
                # NOTE(review): presumably raised by library releases whose
                # signature has no `auto_regenerate` keyword; previously this
                # was swallowed, leaving only the first page filled. Retry
                # with the plain two-argument call.
                writer.update_page_form_field_values(page, values)
        except Exception:
            # Best effort: a page that cannot be updated is left unchanged.
            continue

    try:
        # Kept from the original: one more pass on the first page using the
        # library's default options.
        writer.update_page_form_field_values(writer.pages[0], values)
    except Exception:
        # Deliberately best effort (also covers a document with no pages).
        pass

    output_buffer = io.BytesIO()
    writer.write(output_buffer)
    return output_buffer.getvalue()

def create_simple_pdf(data_row, filename):
    """Render one data row as a simple "column: value" PDF and return its bytes.

    Used when the source PDF has no form to fill. A title line derived from
    ``filename`` is followed by one line per non-empty value; a new page is
    started when the bottom margin is reached.

    Args:
        data_row: Mapping of column name to value. Missing (NaN) and blank
            values are skipped.
        filename: Output file name; a trailing ``.pdf`` is dropped for the
            title.

    Returns:
        bytes: The generated PDF.
    """
    buffer = io.BytesIO()

    # Prefer the Thai-capable font. Reuse it when it is already registered
    # (the launcher may have registered it from a system path); otherwise
    # try the local TTF file. Any failure falls back to Helvetica.
    try:
        if 'THSarabunNew' not in pdfmetrics.getRegisteredFontNames():
            pdfmetrics.registerFont(TTFont('THSarabunNew', 'THSarabunNew.ttf'))
        font_name = 'THSarabunNew'
    except Exception:
        font_name = 'Helvetica'  # Fallback

    p = canvas.Canvas(buffer, pagesize=letter)
    width, height = letter

    # Strip only a trailing extension, not every ".pdf" inside the name.
    title = filename[:-4] if filename.lower().endswith('.pdf') else filename

    p.setFont(font_name, 16)
    p.drawString(50, height - 50, f"เอกสาร: {title}")
    p.line(50, height - 60, 550, height - 60)

    y_position = height - 90
    p.setFont(font_name, 12)
    for column, value in data_row.items():
        if not (pd.notna(value) and str(value).strip()):
            continue
        text = f"{str(column).strip()}: {str(value).strip()}"
        try:
            p.drawString(50, y_position, text)
        except Exception:
            # The active font could not draw the text: degrade to Latin-1,
            # replacing characters that cannot be represented.
            safe_text = text.encode('latin-1', 'replace').decode('latin-1')
            p.drawString(50, y_position, safe_text)
        y_position -= 20
        if y_position < 50:
            # Bottom margin reached: continue on a fresh page.
            p.showPage()
            p.setFont(font_name, 12)
            y_position = height - 50

    p.save()
    return buffer.getvalue()

def read_csv_safe(csv_file):
    """Read a CSV file, trying several encodings and separators.

    Every encoding/separator combination is attempted in order. The first
    parse that yields more than one column wins. If no combination does, the
    first successful single-column parse is returned, so a genuine one-column
    CSV is accepted instead of being reported as unreadable.

    Args:
        csv_file: Uploaded file object exposing its path as ``.name``, or a
            plain path string.

    Returns:
        tuple: ``(DataFrame, None)`` on success, ``(None, error_message)``
        when no combination could be parsed.
    """
    encodings = ['utf-8-sig', 'utf-8', 'cp874', 'tis-620']
    separators = [',', ';', '\t']

    # Upload objects carry the path in .name; a bare path is used as-is.
    filepath = getattr(csv_file, "name", csv_file)

    single_column_fallback = None
    for encoding in encodings:
        for sep in separators:
            try:
                df = pd.read_csv(filepath, encoding=encoding, sep=sep, engine='python')
            except Exception:
                # Wrong encoding/separator (or unreadable file): try the next.
                continue
            if len(df.columns) > 1:
                return df, None
            if single_column_fallback is None and len(df.columns) == 1:
                single_column_fallback = df

    if single_column_fallback is not None:
        return single_column_fallback, None
    return None, "ไม่สามารถอ่านไฟล์ CSV ได้ ลองตรวจสอบ Encoding (ควรเป็น UTF-8) และ Separator (ควรเป็น ,)"

def process_pdf_csv(pdf_file, csv_file, filename_column, file_prefix, use_form_fields, progress=gr.Progress()):
    """Generate one PDF per CSV row and bundle the results into a ZIP file.

    For each row the source form is filled when ``use_form_fields`` is set
    and the PDF has form fields; otherwise a simple PDF is created from the
    row. A ``processing_log.txt`` with one line per row is added to the
    archive.

    Args:
        pdf_file: Uploaded PDF; its path is read from ``pdf_file.name``.
        csv_file: Uploaded CSV, passed to ``read_csv_safe``.
        filename_column: Optional CSV column whose value names each output
            file; the 1-based row position is used when it is missing or
            leaves nothing after sanitising.
        file_prefix: Prefix for every output file name.
        use_form_fields: Whether to try filling existing form fields.
        progress: Gradio progress reporter.

    Returns:
        tuple: ``(zip_path, message)`` on success, ``(None, message)`` on
        failure. Exceptions are caught and reported in the message.
    """
    if not pdf_file or not csv_file:
        return None, "❌ กรุณาอัพโหลดทั้งไฟล์ PDF และ CSV"

    try:
        df, csv_error = read_csv_safe(csv_file)
        if df is None:
            return None, f"❌ ไม่สามารถอ่าน CSV ได้: {csv_error}"

        pdf_path = pdf_file.name
        pdf_fields = analyze_pdf_fields(pdf_path)
        has_form_fields = bool(pdf_fields and "error" not in pdf_fields)
        use_name_column = bool(filename_column) and filename_column in df.columns

        generated_pdfs = {}
        log = []
        total_rows = len(df)

        # Count rows by position: the index label from iterrows() is not
        # guaranteed to be a 0-based integer.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            progress(position / total_rows, f"ประมวลผลแถวที่ {position}/{total_rows}")

            # Build the file name from the chosen column when possible.
            safe_name = ""
            if use_name_column and pd.notna(row[filename_column]):
                safe_name = "".join(
                    c for c in str(row[filename_column])
                    if c.isalnum() or c in (' ', '-', '_')
                ).strip()
            if safe_name:
                base_name = f"{file_prefix}_{safe_name}"
            else:
                # No usable name (missing, or nothing left after sanitising).
                base_name = f"{file_prefix}_{position:03d}"

            # Two rows may sanitise to the same name; add a numeric suffix
            # instead of silently overwriting the earlier PDF.
            filename = f"{base_name}.pdf"
            duplicate_no = 2
            while filename in generated_pdfs:
                filename = f"{base_name}_{duplicate_no}.pdf"
                duplicate_no += 1

            row_data = row.to_dict()

            try:
                if use_form_fields and has_form_fields:
                    # Fill the existing PDF form.
                    pdf_content = fill_pdf_form(pdf_path, row_data)
                    status = "เติมฟอร์มสำเร็จ"
                else:
                    # Create a new PDF from the row.
                    pdf_content = create_simple_pdf(row_data, filename)
                    status = "สร้าง PDF ใหม่" if not has_form_fields else "สร้าง PDF ใหม่ (Fallback)"

                generated_pdfs[filename] = pdf_content
                log.append(f"✅ {filename}: {status}")
            except Exception as e:
                # One bad row must not abort the batch; record it in the log.
                log.append(f"❌ {filename}: เกิดข้อผิดพลาด - {e}")

        if not generated_pdfs:
            return None, "❌ ไม่สามารถสร้าง PDF ได้เลย\n" + "\n".join(log)

        # Write the archive straight to disk, in a private directory so that
        # simultaneous requests cannot overwrite each other's file.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_dir = tempfile.mkdtemp(prefix="generated_pdfs_")
        temp_zip_path = os.path.join(out_dir, f"generated_pdfs_{timestamp}.zip")
        with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_f:
            for filename, pdf_content in generated_pdfs.items():
                zip_f.writestr(filename, pdf_content)
            zip_f.writestr("processing_log.txt", "\n".join(log))

        result_message = f"✅ สร้าง PDF สำเร็จ {len(generated_pdfs)} ไฟล์!\nดูรายละเอียดใน processing_log.txt"
        return temp_zip_path, result_message

    except Exception as e:
        return None, f"❌ เกิดข้อผิดพลาดร้ายแรง: {e}\n{traceback.format_exc()}"


# ==============================================================================
# ส่วนของฟังก์ชัน AI และ OCR (ทางเลือก)
# ==============================================================================

def init_sambanova_ai():
    """Create the client for the SambaNova-hosted model Space.

    The Hugging Face token is read from the ``HF_TOKEN`` environment
    variable instead of a placeholder literal in the source; when the
    variable is unset or empty no token is sent.

    Returns:
        The ``gradio_client.Client`` instance, or ``None`` when the client
        library is unavailable or the connection fails (the error is printed).
    """
    if not SAMBANOVA_AI_ENABLED:
        print("SambaNova AI is disabled.")
        return None
    try:
        # Uses gradio_client.Client rather than gr.load.
        hf_token = os.environ.get("HF_TOKEN") or None
        client = Client("sambanova/Llama-3-8B-Instruct", hf_token=hf_token)
        print("SambaNova AI client initialized successfully.")
        return client
    except Exception as e:
        print(f"Error initializing SambaNova AI: {e}")
        return None

def extract_text_from_image(image_file):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_file: Uploaded image; its path is read from ``image_file.name``.

    Returns:
        tuple: ``(text, status_message)``. ``text`` is empty when OCR is
        unavailable, no image was given, or recognition failed.
    """
    if not AI_OCR_ENABLED or image_file is None:
        return "", "OCR is not available or no image provided."
    try:
        # Configure Tesseract to recognise both Thai and English.
        custom_config = r'--oem 3 --psm 6 -l tha+eng'
        # Context manager so the image's file handle is closed after OCR.
        with Image.open(image_file.name) as image:
            text = pytesseract.image_to_string(image, config=custom_config)
        return text.strip(), "Text extracted successfully."
    except Exception as e:
        return "", f"OCR Error: {e}. ตรวจสอบว่าติดตั้ง Tesseract Engine ถูกต้อง"

def image_to_csv_with_ai(image_file, progress=gr.Progress()):
    """Convert an image to a one-row CSV using OCR plus an AI model.

    The OCR text is sent to the model with a prompt asking for
    ``key: value`` lines; each such line becomes one CSV column.

    Args:
        image_file: Uploaded image file object.
        progress: Gradio progress reporter.

    Returns:
        tuple: ``(csv_path, message)`` on success, ``(None, message)`` when
        a dependency is missing, no text is found, the model is unreachable,
        or processing fails.
    """
    if not AI_OCR_ENABLED:
        return None, "❌ ฟังก์ชันนี้ต้องการ AI/OCR dependencies"
    if image_file is None:
        return None, "❌ กรุณาอัพโหลดรูปภาพ"

    progress(0.2, desc="กำลังอ่านข้อความจากรูปภาพ (OCR)...")
    raw_text, ocr_status = extract_text_from_image(image_file)
    if not raw_text:
        return None, f"❌ ไม่พบข้อความในรูปภาพ: {ocr_status}"

    progress(0.5, desc="กำลังใช้ AI จัดโครงสร้างข้อมูล...")
    ai_client = init_sambanova_ai()
    if not ai_client:
        return None, "❌ ไม่สามารถเชื่อมต่อ AI Model ได้"

    prompt = f"""
From the following text, extract key-value pairs. The output should be only the data in 'key: value' format, one per line.
Example:
Name: John Doe
Address: 123 Main St
Date: 2024-01-15

Text to process:
---
{raw_text}
---
"""
    try:
        # Call the Space's chat endpoint through gradio_client.
        # NOTE(review): assumes predict() returns a plain string here; any
        # other type raises below and is reported as an AI processing error.
        result = ai_client.predict(message=prompt, api_name="/chat")

        progress(0.8, desc="กำลังสร้างไฟล์ CSV...")
        data = []
        for line in result.strip().split('\n'):
            key, colon, value = line.partition(':')
            key = key.strip()
            # Skip lines without a colon or without a key, and trim the
            # padding the model puts around keys and values.
            if colon and key:
                data.append([key, value.strip()])

        if not data:
            return None, "AI ไม่สามารถจัดโครงสร้างข้อมูลได้"

        df = pd.DataFrame(data, columns=['Field', 'Value']).set_index('Field').T

        # Let pandas write the file itself: routing the text through a
        # text-mode handle can translate line endings a second time.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_dir = tempfile.mkdtemp(prefix="extracted_data_")
        temp_csv_path = os.path.join(out_dir, f"extracted_data_{timestamp}.csv")
        df.to_csv(temp_csv_path, index=False, encoding='utf-8-sig')

        return temp_csv_path, "✅ แปลงรูปภาพเป็น CSV สำเร็จ"
    except Exception as e:
        return None, f"❌ เกิดข้อผิดพลาดระหว่างประมวลผลด้วย AI: {e}"


# ==============================================================================
# ส่วนของ UI Analysis Functions
# ==============================================================================

def analyze_pdf_info(pdf_file):
    """Summarise an uploaded PDF as Markdown text for the UI.

    Args:
        pdf_file: Uploaded PDF (path read from ``pdf_file.name``) or ``None``.

    Returns:
        str: Page count plus either the number of form fields with up to ten
        sample names, or a note that no form fields were found. A
        placeholder is returned for ``None`` and an error text on failure.
    """
    if pdf_file is None:
        return "ยังไม่มีไฟล์ PDF"
    try:
        reader = PdfReader(pdf_file.name)
        parts = [f"📄 **ข้อมูล PDF:**\n- จำนวนหน้า: {len(reader.pages)}\n"]

        pdf_fields = analyze_pdf_fields(pdf_file.name)
        if not pdf_fields or "error" in pdf_fields:
            parts.append("- **ไม่พบ Form Fields** (จะใช้วิธีสร้าง PDF ใหม่ทับลงบนกระดาษเปล่า)\n")
            return "".join(parts)

        field_count = len(pdf_fields)
        parts.append(f"- **พบ Form Fields: {field_count} ช่อง** (จะใช้วิธีเติมฟอร์ม)\n")
        parts.append("\n🏷️ **ตัวอย่างชื่อ Fields:**\n")
        # Show at most ten names, then a count of the remainder.
        parts.extend(f"  - `{name}`\n" for name in list(pdf_fields)[:10])
        if field_count > 10:
            parts.append(f"  - ... และอีก {field_count - 10} fields\n")
        return "".join(parts)
    except Exception as e:
        return f"❌ ไม่สามารถวิเคราะห์ PDF: {e}"

def analyze_csv_info(csv_file):
    """Summarise an uploaded CSV for the UI and refresh the column dropdown.

    Args:
        csv_file: Uploaded CSV file object, or ``None``.

    Returns:
        tuple: ``(markdown_text, dropdown_update)``. On success the update
        lists every column as a choice; otherwise it clears the choices.
    """
    def _cleared():
        # Dropdown update that removes all choices and the selection.
        return gr.update(choices=[], value=None)

    if csv_file is None:
        return "ยังไม่มีไฟล์ CSV", _cleared()
    try:
        frame, problem = read_csv_safe(csv_file)
        if frame is None:
            return f"❌ ไม่สามารถอ่าน CSV: {problem}", _cleared()

        columns = frame.columns
        pieces = [
            f"📋 **ข้อมูล CSV:**\n- จำนวนแถว: {len(frame)}\n- จำนวนคอลัมน์: {len(columns)}\n",
            "\n📝 **รายชื่อคอลัมน์:**\n",
        ]
        # List at most fifteen columns, then a count of the remainder.
        pieces.extend(f"  - `{col}`\n" for col in columns[:15])
        hidden = len(columns) - 15
        if hidden > 0:
            pieces.append(f"  - ... และอีก {hidden} คอลัมน์\n")

        # Offer every column as a candidate for the output file name.
        return "".join(pieces), gr.update(choices=columns.tolist(), value=None)
    except Exception as e:
        return f"❌ ไม่สามารถวิเคราะห์ CSV: {e}", _cleared()


# ==============================================================================
# ส่วนของการสร้าง Gradio Interface
# ==============================================================================

def create_interface():
    """Build the Gradio Blocks application and wire up its event handlers.

    The UI has three tabs: template generation from a PDF, filling a PDF from
    CSV data, and image-to-CSV conversion. The convert button on the third
    tab is only visible, and only wired, when ``AI_OCR_ENABLED`` is true.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="PDF Form Filler & Template Generator", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 📄 เครื่องมือจัดการ PDF จากข้อมูล CSV")
        gr.Markdown("รองรับการ **สร้าง Template** จาก PDF, **เติมข้อมูล** จาก CSV, และ **แปลงรูปภาพเป็น CSV** ด้วย AI")

        with gr.Tabs():
            # --- Tab 1: generate a template ---
            with gr.TabItem("🔄 1. สร้าง Template"):
                gr.Markdown("## สร้าง CSV/JSON Template จาก PDF ที่มี Form Fields")
                with gr.Row():
                    with gr.Column(scale=1):
                        template_pdf = gr.File(label="📄 อัพโหลด PDF ต้นฉบับ", file_types=[".pdf"])
                        num_sample_rows = gr.Slider(label="จำนวนแถวตัวอย่างใน CSV", minimum=1, maximum=50, value=5, step=1)
                        generate_template_btn = gr.Button("🚀 สร้าง Template", variant="primary")
                    with gr.Column(scale=2):
                        template_pdf_info = gr.Markdown("อัพโหลด PDF เพื่อดูข้อมูล...")
                        template_result_file = gr.File(label="📦 ดาวน์โหลดไฟล์ Template (ZIP)", interactive=False)
                        template_result_message = gr.Markdown()

            # --- Tab 2: fill a PDF with CSV data ---
            with gr.TabItem("📝 2. เติมข้อมูล PDF"):
                gr.Markdown("## เติมข้อมูลลงใน PDF จากไฟล์ CSV")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### 📂 1. อัพโหลดไฟล์")
                        pdf_file = gr.File(label="📄 PDF Form ต้นฉบับ", file_types=[".pdf"])
                        csv_file = gr.File(label="📊 CSV ข้อมูล", file_types=[".csv"])

                        gr.Markdown("### ⚙️ 2. ตั้งค่า")
                        use_form_fields = gr.Checkbox(label="พยายามเติมข้อมูลลงใน Form Fields ที่มีอยู่", value=True)
                        file_prefix = gr.Textbox(label="คำนำหน้าชื่อไฟล์ (Prefix)", value="Document")
                        filename_column = gr.Dropdown(label="เลือกคอลัมน์ที่จะใช้เป็นชื่อไฟล์ (ถ้ามี)", interactive=True)

                        fill_form_btn = gr.Button("🚀 เริ่มเติมข้อมูล", variant="primary")

                    with gr.Column(scale=2):
                        pdf_info = gr.Markdown("อัพโหลด PDF เพื่อดูข้อมูล...")
                        csv_info = gr.Markdown("อัพโหลด CSV เพื่อดูข้อมูล...")
                        gr.Markdown("---")
                        filled_result_file = gr.File(label="📦 ดาวน์โหลด PDF ทั้งหมด (ZIP)", interactive=False)
                        filled_result_message = gr.Markdown()

            # --- Tab 3: Image to CSV (AI) ---
            with gr.TabItem("🖼️ 3. แปลงรูปภาพเป็น CSV (AI)"):
                gr.Markdown("## ใช้ OCR และ AI เพื่อดึงข้อมูลจากรูปภาพและสร้างเป็นไฟล์ CSV")
                with gr.Row():
                    with gr.Column(scale=1):
                        image_upload = gr.File(label="🖼️ อัพโหลดรูปภาพ (บิล, เอกสาร, ฯลฯ)", file_types=["image"])
                        # The button is hidden when the OCR libraries are missing.
                        image_to_csv_btn = gr.Button("🤖 แปลงเป็น CSV", variant="primary", visible=AI_OCR_ENABLED)
                        if not AI_OCR_ENABLED:
                            gr.Markdown("⚠️ *ฟังก์ชันนี้ถูกปิดใช้งานเนื่องจากไม่พบ Library ที่จำเป็น (Pillow, OpenCV, Pytesseract)*")

                    with gr.Column(scale=2):
                        image_csv_output = gr.File(label="📄 ดาวน์โหลดไฟล์ CSV ที่ได้", interactive=False)
                        image_csv_message = gr.Markdown()

        # --- Event Handlers ---
        # Tab 1: show PDF details on upload, build the template ZIP on click.
        template_pdf.change(fn=analyze_pdf_info, inputs=template_pdf, outputs=template_pdf_info)
        generate_template_btn.click(
            fn=create_template_files,
            inputs=[template_pdf, num_sample_rows],
            outputs=[template_result_file, template_result_message]
        )

        # Tab 2: uploading a CSV also refreshes the file-name column dropdown.
        pdf_file.change(fn=analyze_pdf_info, inputs=pdf_file, outputs=pdf_info)
        csv_file.change(fn=analyze_csv_info, inputs=csv_file, outputs=[csv_info, filename_column])

        fill_form_btn.click(
            fn=process_pdf_csv,
            inputs=[pdf_file, csv_file, filename_column, file_prefix, use_form_fields],
            outputs=[filled_result_file, filled_result_message]
        )

        # Tab 3: the handler is only attached when the OCR libraries loaded.
        if AI_OCR_ENABLED:
            image_to_csv_btn.click(
                fn=image_to_csv_with_ai,
                inputs=[image_upload],
                outputs=[image_csv_output, image_csv_message]
            )

    return app

# --- Launch the application ---
if __name__ == "__main__":
    # Try to register a Thai font up front so PDF creation can use it.
    # The Windows system path is tried first (as before), then the local
    # file name that create_simple_pdf looks for. Failure is only a warning.
    font_candidates = (
        'C:/Windows/Fonts/THSARI.TTF',  # Windows
        'THSarabunNew.ttf',             # file next to the app
    )
    font_registered = False
    try:
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont
    except ImportError:
        pass
    else:
        for font_path in font_candidates:
            try:
                pdfmetrics.registerFont(TTFont('THSarabunNew', font_path))
            except Exception:
                # Missing or unreadable at this location: try the next one.
                continue
            font_registered = True
            break

    if font_registered:
        print("ลงทะเบียน Font 'THSarabunNew' สำหรับ ReportLab สำเร็จ")
    else:
        print("คำเตือน: ไม่พบ Font 'THSarabunNew' ในระบบ อาจทำให้การสร้าง PDF ภาษาไทยมีปัญหา")
        print("แนะนำให้ติดตั้งฟอนต์ TH SarabunPSK หรือปรับแก้ path ของฟอนต์ในโค้ด")

    app = create_interface()
    app.launch(debug=True)