"""Build a multilingual synthetic dataset from plain-text documents.

For every ``*.txt`` file under ``INPUT_DIR`` the script asks a local Ollama
model for English metadata (title, summary, category, info), translates the
title and summary into every language in ``TARGET_LANGUAGES`` and writes each
piece into a parallel directory tree under ``OUTPUT_ROOT``.  Finished files are
recorded in ``HISTORY_FILE`` so an interrupted run can be resumed.
"""

import json
import os
from pathlib import Path
from typing import Any, Optional

from langchain_ollama import OllamaLLM

# --- CONFIGURATION ---
INPUT_DIR = "synthetic_content"
OUTPUT_ROOT = "synthetic_dataset"
HISTORY_FILE = "processed_synthetic_scans_contents.txt"
MODEL_NAME = "llama3"

# Target languages: output sub-directory code -> language name used in prompts
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian",
}

# Value returned by ask_llm_text() when the model call fails.
TRANSLATION_ERROR = "Translation Error"

# LLM client with a low temperature for repeatable output
llm = OllamaLLM(model=MODEL_NAME, temperature=0)


# --- HISTORY HANDLING (RESUME) ---
def load_history() -> set:
    """Return the set of relative paths already recorded in HISTORY_FILE.

    A missing history file yields an empty set; blank lines are ignored.
    """
    if not os.path.exists(HISTORY_FILE):
        return set()
    with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}


def mark_as_done(rel_path: str) -> None:
    """Append ``rel_path`` to HISTORY_FILE so later runs skip this file."""
    with open(HISTORY_FILE, 'a', encoding='utf-8') as f:
        f.write(f"{rel_path}\n")


# --- LLM PROMPTS ---
def ask_llm_json(prompt: str) -> Optional[Any]:
    """Invoke the LLM in JSON mode and parse the reply.

    Returns the decoded JSON value, or ``None`` when the call fails or the
    reply is not valid JSON (the error is printed in both cases).
    """
    try:
        # format="json" asks Ollama to constrain the reply to JSON
        response = llm.invoke(prompt, format="json")
        return json.loads(response)
    except json.JSONDecodeError as e:
        print(f"\n ⚠️ Błąd składni JSON od AI: {e}")
        return None
    except Exception as e:
        print(f"\n ⚠️ Błąd komunikacji z LLM: {e}")
        return None


def ask_llm_text(prompt: str) -> str:
    """Invoke the LLM and return its reply stripped of whitespace and quotes.

    On any failure the error is printed and ``TRANSLATION_ERROR`` is returned,
    so callers can detect the failure instead of crashing.
    """
    try:
        response = llm.invoke(prompt)
    except Exception as e:
        # Report the failure instead of hiding it behind the sentinel value.
        print(f"\n ⚠️ Błąd komunikacji z LLM: {e}")
        return TRANSLATION_ERROR
    return response.strip().strip('"').strip("'")


def get_metadata(text: str, hinted_type: str) -> Optional[Any]:
    """Ask the LLM for English metadata describing ``text``.

    Only the first 3500 characters of ``text`` are sent.  ``hinted_type`` is
    passed to the model as a hint (the caller supplies the folder name).
    Returns whatever ``ask_llm_json`` returns: the parsed JSON or ``None``.
    """
    # Prompt with explicit instructions about the JSON format
    prompt = f"""
    Analyze this document text. Folder hint: {hinted_type}
    Return ONLY a JSON object with these keys:
    - "title_base": Factual title in ENGLISH (format: "[Type] - [Entity] - [Date]")
    - "summary_base": Factual summary in ENGLISH (exactly 5 sentences)
    - "category": One of: financial, legal, personal, health, property, other
    - "info": Key details (e.g. document ID or service name)
    Ensure all quotes inside the text are properly escaped.

    TEXT:
    {text[:3500]}
    """
    return ask_llm_json(prompt)


def translate_section(text, target_lang, content_type="text"):
    """Translate ``text`` into ``target_lang`` using the LLM.

    ``content_type`` only names the kind of text in the prompt (e.g. "title").
    Returns the translation, or ``TRANSLATION_ERROR`` if the call failed.
    """
    prompt = f"""
    Translate the following {content_type} into {target_lang}.
    Output ONLY the translation. No conversational text or markdown.

    TEXT TO TRANSLATE:
    {text}
    """
    return ask_llm_text(prompt)


def save_output(root, kind, lang, subdir, filename, content):
    """Write ``content`` to ``root/kind[/lang]/subdir/filename`` as UTF-8 text.

    The language level is omitted when ``lang`` is falsy.  Missing directories
    are created and ``content`` is converted with ``str()`` before writing.
    """
    path = Path(root) / kind
    if lang:
        path = path / lang
    path = path / subdir
    path.mkdir(parents=True, exist_ok=True)
    with open(path / filename, "w", encoding="utf-8") as f:
        f.write(str(content))


# --- MAIN PER-FILE LOGIC ---
def process_file(file_path, input_root):
    """Generate and store every dataset artefact for one input file.

    The file is added to the history only when its metadata was generated and
    no translation failed; otherwise it is left out so the next run retries it.
    """
    rel_path = file_path.relative_to(input_root)
    base_filename = rel_path.name
    sub_dir = rel_path.parent
    doc_type = sub_dir.name

    # 1. Read the source text
    try:
        raw_text = file_path.read_text(encoding='utf-8')
    except Exception as e:
        print(f" ❌ Błąd odczytu pliku: {e}")
        return

    # 2. Generate metadata (JSON)
    meta = get_metadata(raw_text, doc_type)
    if not meta or not isinstance(meta, dict):
        print(" ❌ Błąd AI: Nie udało się wygenerować poprawnego JSONa.")
        return

    # 3. Save the base data.  `or` also covers keys the model returned as
    # null or empty, which would otherwise be written as the text "None".
    save_output(OUTPUT_ROOT, "content", None, sub_dir, base_filename, raw_text)
    save_output(OUTPUT_ROOT, "category", None, sub_dir, base_filename,
                meta.get("category") or "other")
    save_output(OUTPUT_ROOT, "type", None, sub_dir, base_filename, doc_type)
    save_output(OUTPUT_ROOT, "info", None, sub_dir, base_filename,
                meta.get("info") or "none")

    base_title = meta.get("title_base") or "Document"
    base_summary = meta.get("summary_base") or "No summary available."

    # 4. Translations
    print(f" 🌍 Tłumaczenie na {len(TARGET_LANGUAGES)} języków...", end="", flush=True)
    failed_codes = []
    for code, lang_name in TARGET_LANGUAGES.items():
        is_english = code == "en"  # the base metadata is already in English

        # Titles
        title = base_title if is_english else translate_section(base_title, lang_name, "title")
        save_output(OUTPUT_ROOT, "titles", code, sub_dir, base_filename, title)

        # Summaries
        summary = base_summary if is_english else translate_section(base_summary, lang_name, "summary")
        save_output(OUTPUT_ROOT, "summary", code, sub_dir, base_filename, summary)

        if not is_english and TRANSLATION_ERROR in (title, summary):
            failed_codes.append(code)
        print(".", end="", flush=True)

    if failed_codes:
        # Leave the file out of the history so a later run retries it and
        # overwrites the placeholder outputs written above.
        print(f" ⚠️ Nieudane tłumaczenia ({', '.join(failed_codes)}) - "
              "plik zostanie przetworzony ponownie przy następnym uruchomieniu.")
        return

    print(" OK")
    mark_as_done(str(rel_path))


def main():
    """Process every ``*.txt`` file under INPUT_DIR that is not in the history."""
    input_path = Path(INPUT_DIR)
    if not input_path.exists():
        print(f"❌ Brak folderu wejściowego: {INPUT_DIR}")
        return

    processed = load_history()
    print(f"📂 Historia: {len(processed)} plików już przetworzonych.")

    files = list(input_path.rglob("*.txt"))
    print(f"🚀 Start: {len(files)} plików do analizy.")

    for f in files:
        rel_path = str(f.relative_to(input_path))
        if rel_path in processed:
            continue

        print(f"📄 Przetwarzam: {rel_path}")
        try:
            process_file(f, input_path)
        except KeyboardInterrupt:
            print("\n🛑 Przerwano ręcznie. Postęp zapisany.")
            break
        except Exception as e:
            # One bad file must not stop the whole batch.
            print(f"\n❌ Błąd krytyczny przy pliku {rel_path}: {e}")


if __name__ == "__main__":
    main()