import os
import json
import pytesseract
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
from langchain_ollama import OllamaLLM
# --- CONFIGURATION ---
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'

# Input folder
INPUT_DIR = "scans"
HISTORY_FILE = "processed_real_scans_files.txt"  # File listing scans already processed
MODEL_NAME = "llama3"

# Target languages
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian",
}
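# Note: English is the pivot language - get_core_metadata() produces the base
# title/summary in English, and the translation loop below reuses it verbatim
# for "en" instead of asking the model again.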
llm = OllamaLLM(model=MODEL_NAME, temperature=0)
# New, consolidated list of types (matching the new Enum)
ALLOWED_TYPES = [
    # Financial
    "taxDocument", "invoice", "receipt", "utilityBill", "bankStatement",
    "loanAgreement", "insurancePolicy",
    # Legal
    "notarialDeed", "courtDocument", "powerOfAttorney", "contract",
    # Personal
    "idCard", "passport", "birthCertificate", "marriageCertificate",
    "deathCertificate", "officialCertificate", "drivingLicense",
    "educationDocument", "cv",
    # Health
    "medicalDocument", "prescription", "referral", "vaccinationCard",
    "sanitaryBooklet",
    # Property
    "propertyDeed", "rentalAgreement", "vehicleDocument", "technicalInspection",
    # Other
    "documentScan", "application", "certificate", "other",
]
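
# A scan's parent folder name doubles as a classification hint: if it matches
# one of the types above, get_core_metadata() passes it to the model as a
# "Strong Hint". E.g. a file at scans/invoice/jan.pdf (hypothetical path)
# nudges the model toward "invoice".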

# --- HISTORY HANDLING (RESUME) ---
def load_history():
    """Load the list of already-processed files into a set (for fast lookups)."""
    if not os.path.exists(HISTORY_FILE):
        return set()
    with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())
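
# The history file stores one input-relative path per line, e.g. (hypothetical):
#   invoice/scan_001.pdf
#   receipt/IMG_0042.jpg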

def mark_as_done(rel_path):
    """Append a file to the history."""
    with open(HISTORY_FILE, 'a', encoding='utf-8') as f:
        f.write(f"{rel_path}\n")

# --- OCR AND LLM ---
def perform_ocr(file_path):
    text = ""
    try:
        langs = 'pol+eng'
        if file_path.suffix.lower() == ".pdf":
            # PDFs are rasterized page by page, then OCR'd like images
            pages = convert_from_path(file_path)
            for page in pages:
                text += pytesseract.image_to_string(page, lang=langs)
        else:
            text = pytesseract.image_to_string(Image.open(file_path), lang=langs)
    except Exception as e:
        print(f"  [!] OCR error: {file_path.name}: {e}")
    return text
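
# Optional pre-flight check (a sketch, assuming Tesseract 4+ so that
# pytesseract.get_languages() is available): 'pol+eng' OCR degrades silently
# if the Polish traineddata is not installed.
#   if "pol" not in pytesseract.get_languages(config=""):
#       print("  [!] Polish language pack missing for Tesseract")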

def ask_llm_json(prompt):
    try:
        response = llm.invoke(prompt)
        # Strip markdown fences, then slice out the outermost JSON object
        clean = response.replace("```json", "").replace("```", "").strip()
        start, end = clean.find('{'), clean.rfind('}') + 1
        return json.loads(clean[start:end])
    except Exception:
        return None
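
# The fence-stripping and brace-slicing above tolerate chatty replies; a
# response such as (illustrative):
#   Here is the JSON: {"category": "financial"} Hope this helps!
# is reduced to '{"category": "financial"}' before json.loads().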

def ask_llm_text(prompt):
    try:
        response = llm.invoke(prompt)
        return response.strip().strip('"').strip("'")
    except Exception:
        return "Translation Error"

# --- PROCESSING LOGIC ---
def get_core_metadata(text, hinted_type=None):
    print("  🧠 Analyzing document structure (Core Metadata)...")
    # If the folder name suggests a type, pass it to the model as a hint
    hint_str = ""
    if hinted_type in ALLOWED_TYPES:
        hint_str = f"Strong Hint: The document is likely located in folder '{hinted_type}'."
    prompt = f"""
Analyze the following document text.
{hint_str}
Extract structured data.
RULES:
1. 'summary_base': Write a factual summary in ENGLISH (5 sentences).
2. 'title_base': Write a title in ENGLISH format: "[Specific Type] - [Entity] - [Date]".
   (e.g., "Tax Document (PIT-11) - Employer Name - 2023")
3. 'category': Must be one of: financial, legal, personal, health, property, other.
4. 'type': Choose the BEST MATCH from this specific list: {", ".join(ALLOWED_TYPES)}.
5. 'info': Specific details (e.g. "PIT-11", "Umowa o pracę", "Prąd").
Return ONLY JSON:
{{
    "title_base": "...",
    "summary_base": "...",
    "category": "...",
    "type": "...",
    "info": "..."
}}
TEXT:
{text[:4000]}
"""
    return ask_llm_json(prompt)
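
# On success, ask_llm_json() returns a plain dict shaped like the template
# above, e.g. (illustrative values only):
#   {"title_base": "Invoice - ACME Sp. z o.o. - 2024-03-01",
#    "summary_base": "...", "category": "financial",
#    "type": "invoice", "info": "FV/2024/03/001"}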

def translate_section(text, target_lang, content_type="text"):
    prompt = f"""
Translate the following {content_type} into {target_lang}.
Output ONLY the translation. No explanations. No markdown.
TEXT TO TRANSLATE:
{text}
"""
    return ask_llm_text(prompt)
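
# Example call (hypothetical strings) - the model might return something like:
#   translate_section("Invoice - ACME - 2024", "Polish", "title")
#   -> "Faktura - ACME - 2024"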

def save_file(root_folder, lang_code, sub_dir, filename, content):
    path = Path(root_folder) / lang_code / sub_dir
    path.mkdir(parents=True, exist_ok=True)
    with open(path / filename, "w", encoding="utf-8") as f:
        f.write(str(content))

def save_meta(root_folder, sub_dir, filename, content):
    path = Path(root_folder) / sub_dir
    path.mkdir(parents=True, exist_ok=True)
    with open(path / filename, "w", encoding="utf-8") as f:
        f.write(str(content))
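
# Resulting on-disk layout (sketch, based on the calls in process_file below):
#   content/<sub_dir>/<name>.txt       raw OCR text            (save_meta)
#   category|type|info/<sub_dir>/...   language-independent    (save_meta)
#   titles/<lang>/<sub_dir>/...        per-language titles     (save_file)
#   summary/<lang>/<sub_dir>/...       per-language summaries  (save_file)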

def process_file(file_path, input_root):
    rel_path = file_path.relative_to(input_root)
    rel_path_str = str(rel_path)  # Key used in the history file
    base_filename = rel_path.stem + ".txt"
    sub_dir = rel_path.parent
    hinted_type = sub_dir.name if sub_dir.name != input_root.name else None

    # 1. OCR
    raw_text = perform_ocr(file_path)
    if not raw_text.strip():
        print("  ⚠️ Empty OCR result - marking as processed (no output).")
        mark_as_done(rel_path_str)
        return
    # Save the original text (content) - this stays, since it is the input data
    save_meta("content", sub_dir, base_filename, raw_text)

    # 2. Core analysis
    core_data = get_core_metadata(raw_text, hinted_type)
    if not core_data:
        print("  ❌ AI analysis failed. Aborting this file.")
        return
    # Save language-independent data
    save_meta("category", sub_dir, base_filename, core_data.get("category", "other"))
    save_meta("type", sub_dir, base_filename, core_data.get("type", "other"))
    save_meta("info", sub_dir, base_filename, core_data.get("info", "none"))
    base_title = core_data.get("title_base", "Document")
    base_summary = core_data.get("summary_base", "No summary.")

    # 3. Translation loop (LABELS ONLY)
    print("  🌍 Generating labels (titles/summaries)...")
    for code, lang_name in TARGET_LANGUAGES.items():
        print(f"   -> [{code.upper()}] {lang_name}...", end="", flush=True)
        # A. Title
        if code == "en":
            final_title = base_title
        else:
            final_title = translate_section(base_title, lang_name, "title")
        save_file("titles", code, sub_dir, base_filename, final_title)
        # B. Summary
        if code == "en":
            final_summary = base_summary
        else:
            final_summary = translate_section(base_summary, lang_name, "summary")
        save_file("summary", code, sub_dir, base_filename, final_summary)
        # C. Full content - REMOVED (saves time and tokens)
        print(" OK.")

    # SUCCESS! Only now is the file recorded in the history
    print(f"✅ Finished: {file_path.name}")
    mark_as_done(rel_path_str)

def main():
    input_root = Path(INPUT_DIR)
    if not input_root.exists():
        print(f"Missing input folder: {INPUT_DIR}")
        return

    # Load the history
    processed_files = load_history()
    print(f"📂 History loaded: {len(processed_files)} files already processed.")

    all_files = [f for f in input_root.rglob("*") if
                 f.is_file() and f.suffix.lower() in [".pdf", ".jpg", ".png", ".jpeg"]]
    print(f"🚀 Found {len(all_files)} files to analyze in total.")

    for f in all_files:
        rel_path_str = str(f.relative_to(input_root))
        # Skip files already in the history
        if rel_path_str in processed_files:
            print(f"⏩ Skipping (already in history): {rel_path_str}")
            continue
        print(f"\n📄 Processing: {rel_path_str}")
        try:
            process_file(f, input_root)
        except KeyboardInterrupt:
            print("\n🛑 Stopped by the user. Progress saved.")
            break
        except Exception as e:
            print(f"\n❌ Critical error for {rel_path_str}: {e}")

if __name__ == "__main__":
    main()