|
|
import os |
|
|
import json |
|
|
import pytesseract |
|
|
from pathlib import Path |
|
|
from PIL import Image |
|
|
from pdf2image import convert_from_path |
|
|
from langchain_ollama import OllamaLLM |
|
|
|
|
|
|
|
|
# Absolute path to the Tesseract binary (Homebrew layout on Apple Silicon macOS).
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'

# Root folder scanned recursively for documents to process.
INPUT_DIR = "scans"

# Plain-text log of already-processed relative paths, one per line.
HISTORY_FILE = "processed_real_scans_files.txt"

# Ollama model used for both metadata extraction and translation.
MODEL_NAME = "llama3"
|
|
|
|
|
|
|
|
# Output languages: ISO 639-1 code -> English name interpolated into the
# translation prompt. "en" is special-cased downstream (process_file copies
# the LLM's English base output through without a translation round-trip).
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian"
}

# Single shared LLM client; temperature=0 keeps extraction output deterministic.
llm = OllamaLLM(model=MODEL_NAME, temperature=0)
|
|
|
|
|
|
|
|
# Closed vocabulary of document types the LLM must choose from (rule 4 of the
# prompt in get_core_metadata). The blank-line groups in the original appear
# to mirror the prompt's category list; the labels below are best-effort.
ALLOWED_TYPES = [
    # financial
    "taxDocument", "invoice", "receipt", "utilityBill", "bankStatement",
    "loanAgreement", "insurancePolicy",
    # legal
    "notarialDeed", "courtDocument", "powerOfAttorney", "contract",
    # personal / identity & civil status
    "idCard", "passport", "birthCertificate", "marriageCertificate",
    "deathCertificate", "officialCertificate", "drivingLicense",
    "educationDocument", "cv",
    # health
    "medicalDocument", "prescription", "referral", "vaccinationCard",
    "sanitaryBooklet",
    # property & vehicles
    "propertyDeed", "rentalAgreement", "vehicleDocument", "technicalInspection",
    # generic / catch-all
    "documentScan", "application", "certificate", "other"
]
|
|
|
|
|
|
|
|
|
|
|
def load_history(history_file=None):
    """Load the set of already-processed relative paths.

    Args:
        history_file: optional override of the module-level HISTORY_FILE
            (new backward-compatible parameter; callers passing nothing
            get the original behavior).

    Returns:
        A set of stripped, non-empty lines — a set so the per-file
        membership test in main() is O(1). Empty set when the history
        file does not exist yet (first run).
    """
    if history_file is None:
        history_file = HISTORY_FILE
    if not os.path.exists(history_file):
        return set()
    with open(history_file, 'r', encoding='utf-8') as f:
        # Skip blank lines so a trailing newline doesn't add a "" entry.
        return {line.strip() for line in f if line.strip()}
|
|
|
|
|
|
|
|
def mark_as_done(rel_path, history_file=None):
    """Append *rel_path* to the history file (one path per line).

    Args:
        rel_path: relative path string identifying the processed file.
        history_file: optional override of the module-level HISTORY_FILE
            (new backward-compatible parameter).

    Appending line-by-line keeps progress durable even if the run is
    interrupted mid-batch.
    """
    if history_file is None:
        history_file = HISTORY_FILE
    with open(history_file, 'a', encoding='utf-8') as f:
        f.write(f"{rel_path}\n")
|
|
|
|
|
|
|
|
|
|
|
def perform_ocr(file_path):
    """Run Tesseract OCR (Polish + English) on a PDF or image scan.

    Args:
        file_path: pathlib.Path to the file; routing is by suffix —
            ".pdf" is rasterized page-by-page via pdf2image, anything
            else is opened as an image with PIL.

    Returns:
        The extracted text. On any failure the error is printed and
        whatever text was accumulated so far (possibly "") is returned,
        so the pipeline keeps running.
    """
    text = ""
    try:
        langs = 'pol+eng'
        if file_path.suffix.lower() == ".pdf":
            # Keep the running concatenation: a failure on page N still
            # returns the pages OCR'd before it.
            pages = convert_from_path(file_path)
            for page in pages:
                text += pytesseract.image_to_string(page, lang=langs)
        else:
            # Fix: Image.open is lazy and holds the file descriptor open;
            # the original never closed it. `with` releases it deterministically.
            with Image.open(file_path) as img:
                text = pytesseract.image_to_string(img, lang=langs)
    except Exception as e:
        print(f"  [!] Błąd OCR: {file_path.name}: {e}")
    return text
|
|
|
|
|
|
|
|
def ask_llm_json(prompt): |
|
|
try: |
|
|
response = llm.invoke(prompt) |
|
|
clean = response.replace("```json", "").replace("```", "").strip() |
|
|
start, end = clean.find('{'), clean.rfind('}') + 1 |
|
|
return json.loads(clean[start:end]) |
|
|
except Exception: |
|
|
return None |
|
|
|
|
|
|
|
|
def ask_llm_text(prompt):
    """Query the LLM and return its reply as a trimmed plain string.

    Whitespace is removed first, then surrounding double quotes, then
    surrounding single quotes. Any failure yields the marker string
    "Translation Error" so the pipeline never crashes on a bad call.
    """
    try:
        cleaned = llm.invoke(prompt).strip()
        cleaned = cleaned.strip('"')
        return cleaned.strip("'")
    except Exception:
        return "Translation Error"
|
|
|
|
|
|
|
|
|
|
|
def get_core_metadata(text, hinted_type=None):
    """Ask the LLM for structured core metadata about an OCR'd document.

    Args:
        text: raw OCR text; only the first 4000 characters are sent.
        hinted_type: name of the folder the file came from — used as a
            prompt hint only when it matches ALLOWED_TYPES exactly.

    Returns:
        The dict parsed by ask_llm_json (keys: title_base, summary_base,
        category, type, info), or None on failure.
    """
    print("   🧠 Analiza struktury dokumentu (Core Metadata)...")

    hint_str = ""
    if hinted_type in ALLOWED_TYPES:
        hint_str = f"Strong Hint: The document is likely located in folder '{hinted_type}'."

    # NOTE: the prompt text below is runtime behavior — do not edit casually.
    prompt = f"""
Analyze the following document text.
{hint_str}

Extract structured data.
RULES:
1. 'summary_base': Write a factual summary in ENGLISH (5 sentences).
2. 'title_base': Write a title in ENGLISH format: "[Specific Type] - [Entity] - [Date]".
(e.g., "Tax Document (PIT-11) - Employer Name - 2023")
3. 'category': Must be one of: financial, legal, personal, health, property, other.
4. 'type': Choose the BEST MATCH from this specific list: {", ".join(ALLOWED_TYPES)}.
5. 'info': Specific details (e.g. "PIT-11", "Umowa o pracę", "Prąd").

Return ONLY JSON:
{{
"title_base": "...",
"summary_base": "...",
"category": "...",
"type": "...",
"info": "..."
}}

TEXT:
{text[:4000]}
"""
    return ask_llm_json(prompt)
|
|
|
|
|
|
|
|
def translate_section(text, target_lang, content_type="text"):
    """Translate *text* into *target_lang* via the LLM.

    Args:
        text: the English base title or summary to translate.
        target_lang: English language name (value of TARGET_LANGUAGES).
        content_type: short label ("title"/"summary"/"text") interpolated
            into the prompt so the model knows what it is translating.

    Returns:
        The translated string, or "Translation Error" (from ask_llm_text).
    """
    # NOTE: the prompt text below is runtime behavior — do not edit casually.
    prompt = f"""
Translate the following {content_type} into {target_lang}.
Output ONLY the translation. No explanations. No markdown.

TEXT TO TRANSLATE:
{text}
"""
    return ask_llm_text(prompt)
|
|
|
|
|
|
|
|
def save_file(root_folder, lang_code, sub_dir, filename, content):
    """Write *content* to <root_folder>/<lang_code>/<sub_dir>/<filename>.

    Creates the language-tiered directory structure on demand; any
    non-string content is coerced with str().
    """
    target_dir = Path(root_folder) / lang_code / sub_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / filename
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(str(content))
|
|
|
|
|
|
|
|
def save_meta(root_folder, sub_dir, filename, content):
    """Write *content* to <root_folder>/<sub_dir>/<filename>.

    Like save_file but without the per-language tier — used for metadata
    outputs (content/, category/, type/, info/). Directories are created
    on demand and content is coerced with str().
    """
    out_dir = Path(root_folder) / sub_dir
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / filename).write_text(str(content), encoding="utf-8")
|
|
|
|
|
|
|
|
def process_file(file_path, input_root):
    """Run the full pipeline for one scan: OCR -> metadata -> translations.

    Args:
        file_path: Path to the scan file.
        input_root: Path of INPUT_DIR; the relative path derived from it
            keys the history file and mirrors the output folder layout.

    Side effects: writes under content/, category/, type/, info/,
    titles/<lang>/ and summary/<lang>/, and appends to the history file
    on success or on empty OCR output.
    """
    rel_path = file_path.relative_to(input_root)
    rel_path_str = str(rel_path)

    base_filename = rel_path.stem + ".txt"
    sub_dir = rel_path.parent
    # The immediate parent folder doubles as a document-type hint for the LLM.
    hinted_type = sub_dir.name if sub_dir.name != input_root.name else None

    raw_text = perform_ocr(file_path)

    if not raw_text.strip():
        # Nothing readable: record it anyway so the file is never retried.
        print("   ⚠️ Pusty OCR - oznaczam jako przetworzony (bez wyników).")
        mark_as_done(rel_path_str)
        return

    # Persist the raw OCR text first so it survives any later failure.
    save_meta("content", sub_dir, base_filename, raw_text)

    core_data = get_core_metadata(raw_text, hinted_type)

    if not core_data:
        # Deliberately NOT marked as done: the file is retried on the next run.
        print("   ❌ Błąd analizy AI. Przerywam dla tego pliku.")
        return

    save_meta("category", sub_dir, base_filename, core_data.get("category", "other"))
    save_meta("type", sub_dir, base_filename, core_data.get("type", "other"))
    save_meta("info", sub_dir, base_filename, core_data.get("info", "none"))

    base_title = core_data.get("title_base", "Document")
    base_summary = core_data.get("summary_base", "No summary.")

    print("   🌍 Rozpoczynam generowanie etykiet (tytuły/podsumowania)...")

    for code, lang_name in TARGET_LANGUAGES.items():
        print(f"      -> [{code.upper()}] {lang_name}...", end="", flush=True)

        # English is the LLM's base output language — copy it through
        # instead of paying for a no-op translation call.
        if code == "en":
            final_title = base_title
        else:
            final_title = translate_section(base_title, lang_name, "title")
        save_file("titles", code, sub_dir, base_filename, final_title)

        if code == "en":
            final_summary = base_summary
        else:
            final_summary = translate_section(base_summary, lang_name, "summary")
        save_file("summary", code, sub_dir, base_filename, final_summary)

        print(" OK.")

    print(f"✅ Zakończono: {file_path.name}")
    mark_as_done(rel_path_str)
|
|
|
|
|
|
|
|
def main():
    """Walk INPUT_DIR recursively and process every scan not yet in history.

    Ctrl-C stops the batch cleanly (per-file progress is already persisted
    in the history file); any other per-file exception is logged and the
    loop moves on to the next file.
    """
    input_root = Path(INPUT_DIR)
    if not input_root.exists():
        print(f"Brak folderu wejściowego: {INPUT_DIR}")
        return

    processed_files = load_history()
    print(f"📂 Załadowano historię: {len(processed_files)} plików już przetworzonych.")

    supported = {".pdf", ".jpg", ".png", ".jpeg"}
    all_files = [
        candidate
        for candidate in input_root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in supported
    ]
    print(f"🚀 Znaleziono łącznie {len(all_files)} plików do analizy.")

    for candidate in all_files:
        rel_path_str = str(candidate.relative_to(input_root))

        if rel_path_str in processed_files:
            print(f"⏩ Pomijam (już w historii): {rel_path_str}")
            continue

        print(f"\n📄 Przetwarzanie: {rel_path_str}")
        try:
            process_file(candidate, input_root)
        except KeyboardInterrupt:
            print("\n🛑 Zatrzymano przez użytkownika. Postęp zapisany.")
            break
        except Exception as e:
            print(f"\n❌ Krytyczny błąd dla {rel_path_str}: {e}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry-point guard: allows importing this module without starting the pipeline.
    main()