# paperstack_document_data_retrieval / retrieve_multilang.py
# Uploaded by pemix09 using the upload-large-folder tool (commit 8fd4eb2, verified).
import os
import json
import pytesseract
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
from langchain_ollama import OllamaLLM
# --- CONFIGURATION ---
# Absolute path to the tesseract binary (Homebrew location on macOS).
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'
# Input folder containing the scans to process.
INPUT_DIR = "scans"
HISTORY_FILE = "processed_real_scans_files.txt" # File listing scans already processed (resume support)
MODEL_NAME = "llama3"
# Target languages for generated titles/summaries (ISO code -> English name,
# as expected by the translation prompt).
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian"
}
# Deterministic output (temperature=0) so reruns classify consistently.
llm = OllamaLLM(model=MODEL_NAME, temperature=0)
# Consolidated list of document types (kept in sync with the new enum).
ALLOWED_TYPES = [
    # Financial
    "taxDocument", "invoice", "receipt", "utilityBill", "bankStatement",
    "loanAgreement", "insurancePolicy",
    # Legal
    "notarialDeed", "courtDocument", "powerOfAttorney", "contract",
    # Personal
    "idCard", "passport", "birthCertificate", "marriageCertificate",
    "deathCertificate", "officialCertificate", "drivingLicense",
    "educationDocument", "cv",
    # Health
    "medicalDocument", "prescription", "referral", "vaccinationCard",
    "sanitaryBooklet",
    # Property
    "propertyDeed", "rentalAgreement", "vehicleDocument", "technicalInspection",
    # Other
    "documentScan", "application", "certificate", "other"
]
# --- HISTORY HANDLING (RESUME SUPPORT) ---
def load_history(history_file=None):
    """Load the set of already-processed file paths for O(1) membership tests.

    Args:
        history_file: Optional path to the history file; defaults to the
            module-level HISTORY_FILE (parameterized for testability).

    Returns:
        set[str]: one entry per non-blank line, stripped of whitespace.
        An empty set if the history file does not exist yet.
    """
    if history_file is None:
        history_file = HISTORY_FILE
    if not os.path.exists(history_file):
        return set()
    with open(history_file, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}
def mark_as_done(rel_path, history_file=None):
    """Append *rel_path* to the history file (one path per line).

    Args:
        rel_path: Path string (relative to the input root) to record.
        history_file: Optional history file path; defaults to the
            module-level HISTORY_FILE (mirrors load_history's parameter).
    """
    if history_file is None:
        history_file = HISTORY_FILE
    with open(history_file, 'a', encoding='utf-8') as f:
        f.write(f"{rel_path}\n")
# --- OCR AND LLM HELPERS ---
def perform_ocr(file_path):
    """Run Tesseract OCR (Polish + English) on a PDF or image.

    PDFs are rasterized page by page and the page texts concatenated.
    On any OCR failure the text accumulated so far is returned (possibly "").
    """
    languages = 'pol+eng'
    extracted = ""
    try:
        if file_path.suffix.lower() == ".pdf":
            for page_image in convert_from_path(file_path):
                extracted += pytesseract.image_to_string(page_image, lang=languages)
        else:
            extracted = pytesseract.image_to_string(Image.open(file_path), lang=languages)
    except Exception as e:
        print(f" [!] Błąd OCR: {file_path.name}: {e}")
    return extracted
def ask_llm_json(prompt):
    """Send *prompt* to the LLM and parse the JSON object embedded in its reply.

    Markdown code fences are stripped and only the outermost {...} span is
    parsed. Returns the parsed dict, or None if invocation or parsing fails.
    """
    try:
        reply = llm.invoke(prompt)
        body = reply.replace("```json", "").replace("```", "").strip()
        opening = body.find('{')
        closing = body.rfind('}') + 1
        return json.loads(body[opening:closing])
    except Exception:
        return None
def ask_llm_text(prompt):
    """Send *prompt* to the LLM and return its reply as bare text.

    Surrounding whitespace and stray quote characters are removed.
    Returns the literal string "Translation Error" on any failure.
    """
    try:
        return llm.invoke(prompt).strip().strip('"').strip("'")
    except Exception:
        return "Translation Error"
# --- PROCESSING LOGIC ---
def get_core_metadata(text, hinted_type=None):
    """Extract language-independent core metadata from document text via the LLM.

    Args:
        text: OCR-extracted document text (only the first 4000 chars are sent).
        hinted_type: Optional type hint taken from the containing folder name;
            only used when it matches an entry in ALLOWED_TYPES.

    Returns:
        dict with keys title_base, summary_base, category, type, info
        (parsed from the LLM's JSON reply), or None on failure.
    """
    print(" 🧠 Analiza struktury dokumentu (Core Metadata)...")
    # If the containing folder suggests a known type, pass it on as a hint.
    hint_str = ""
    if hinted_type in ALLOWED_TYPES:
        hint_str = f"Strong Hint: The document is likely located in folder '{hinted_type}'."
    prompt = f"""
Analyze the following document text.
{hint_str}
Extract structured data.
RULES:
1. 'summary_base': Write a factual summary in ENGLISH (5 sentences).
2. 'title_base': Write a title in ENGLISH format: "[Specific Type] - [Entity] - [Date]".
(e.g., "Tax Document (PIT-11) - Employer Name - 2023")
3. 'category': Must be one of: financial, legal, personal, health, property, other.
4. 'type': Choose the BEST MATCH from this specific list: {", ".join(ALLOWED_TYPES)}.
5. 'info': Specific details (e.g. "PIT-11", "Umowa o pracę", "Prąd").
Return ONLY JSON:
{{
"title_base": "...",
"summary_base": "...",
"category": "...",
"type": "...",
"info": "..."
}}
TEXT:
{text[:4000]}
"""
    return ask_llm_json(prompt)
def translate_section(text, target_lang, content_type="text"):
    """Translate *text* into *target_lang* via the LLM.

    *content_type* (e.g. "title", "summary") is interpolated into the prompt
    to give the model context. Returns the bare translation string.
    """
    request = f"""
Translate the following {content_type} into {target_lang}.
Output ONLY the translation. No explanations. No markdown.
TEXT TO TRANSLATE:
{text}
"""
    return ask_llm_text(request)
def save_file(root_folder, lang_code, sub_dir, filename, content):
    """Write str(content) to root_folder/lang_code/sub_dir/filename (UTF-8).

    Missing directories are created on demand.
    """
    target_dir = Path(root_folder) / lang_code / sub_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / filename).write_text(str(content), encoding="utf-8")
def save_meta(root_folder, sub_dir, filename, content):
    """Write str(content) to root_folder/sub_dir/filename (UTF-8).

    Like save_file but without a language subfolder, for
    language-independent metadata. Directories are created on demand.
    """
    destination = Path(root_folder) / sub_dir
    destination.mkdir(parents=True, exist_ok=True)
    (destination / filename).write_text(str(content), encoding="utf-8")
def process_file(file_path, input_root):
    """Run the full pipeline for one scan: OCR, core metadata, per-language labels.

    Side effects: writes the OCR text plus category/type/info under the
    content/category/type/info folders, and a title and summary per language
    under titles/<code>/ and summary/<code>/. The file is recorded in the
    history only after full success (or when OCR yields no text at all).

    Args:
        file_path: Path of the scan to process.
        input_root: Path of the input root; used to compute the relative key.
    """
    rel_path = file_path.relative_to(input_root)
    rel_path_str = str(rel_path)  # Key used in the history file
    base_filename = rel_path.stem + ".txt"
    sub_dir = rel_path.parent
    # The containing folder name doubles as a type hint (unless at the root).
    hinted_type = sub_dir.name if sub_dir.name != input_root.name else None
    # 1. OCR
    raw_text = perform_ocr(file_path)
    if not raw_text.strip():
        print(" ⚠️ Pusty OCR - oznaczam jako przetworzony (bez wyników).")
        # Mark as done anyway so an empty scan is not retried on every run.
        mark_as_done(rel_path_str)
        return
    # Save the original text (Content) — kept because it is the input data.
    save_meta("content", sub_dir, base_filename, raw_text)
    # 2. Core analysis (language-independent metadata)
    core_data = get_core_metadata(raw_text, hinted_type)
    if not core_data:
        print(" ❌ Błąd analizy AI. Przerywam dla tego pliku.")
        # No mark_as_done here: the file will be retried on the next run.
        return
    # Save language-independent fields.
    save_meta("category", sub_dir, base_filename, core_data.get("category", "other"))
    save_meta("type", sub_dir, base_filename, core_data.get("type", "other"))
    save_meta("info", sub_dir, base_filename, core_data.get("info", "none"))
    base_title = core_data.get("title_base", "Document")
    base_summary = core_data.get("summary_base", "No summary.")
    # 3. Translation loop (LABELS ONLY: titles and summaries)
    print(" 🌍 Rozpoczynam generowanie etykiet (tytuły/podsumowania)...")
    for code, lang_name in TARGET_LANGUAGES.items():
        print(f" -> [{code.upper()}] {lang_name}...", end="", flush=True)
        # A. Title (the English base is reused verbatim, no LLM round-trip)
        if code == "en":
            final_title = base_title
        else:
            final_title = translate_section(base_title, lang_name, "title")
        save_file("titles", code, sub_dir, base_filename, final_title)
        # B. Summary
        if code == "en":
            final_summary = base_summary
        else:
            final_summary = translate_section(base_summary, lang_name, "summary")
        save_file("summary", code, sub_dir, base_filename, final_summary)
        # C. Full-content translation — REMOVED (saves time and tokens)
        print(" OK.")
    # SUCCESS — only now record the file in the history.
    print(f"✅ Zakończono: {file_path.name}")
    mark_as_done(rel_path_str)
def main():
    """Walk INPUT_DIR recursively and process every supported scan not yet in history."""
    input_root = Path(INPUT_DIR)
    if not input_root.exists():
        print(f"Brak folderu wejściowego: {INPUT_DIR}")
        return
    # Resume support: skip anything already recorded in the history file.
    processed_files = load_history()
    print(f"📂 Załadowano historię: {len(processed_files)} plików już przetworzonych.")
    supported_suffixes = {".pdf", ".jpg", ".png", ".jpeg"}
    all_files = [
        candidate
        for candidate in input_root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in supported_suffixes
    ]
    print(f"🚀 Znaleziono łącznie {len(all_files)} plików do analizy.")
    for candidate in all_files:
        rel_path_str = str(candidate.relative_to(input_root))
        # History check
        if rel_path_str in processed_files:
            print(f"⏩ Pomijam (już w historii): {rel_path_str}")
            continue
        print(f"\n📄 Przetwarzanie: {rel_path_str}")
        try:
            process_file(candidate, input_root)
        except KeyboardInterrupt:
            print("\n🛑 Zatrzymano przez użytkownika. Postęp zapisany.")
            break
        except Exception as e:
            # Keep going: one bad file must not abort the whole batch.
            print(f"\n❌ Krytyczny błąd dla {rel_path_str}: {e}")
# Script entry point: run the batch pipeline when executed directly.
if __name__ == "__main__":
    main()