# Synthetic dataset builder: extracts document metadata with a local Ollama LLM
# and generates translated titles/summaries for multiple target languages.
import os
import json
from pathlib import Path
from langchain_ollama import OllamaLLM
# --- CONFIGURATION ---
INPUT_DIR = "synthetic_content"    # root folder containing the input .txt documents
OUTPUT_ROOT = "synthetic_dataset"  # root folder for all generated outputs
HISTORY_FILE = "processed_synthetic_scans_contents.txt"  # resume log: one processed relative path per line
MODEL_NAME = "llama3"              # Ollama model identifier
# Target languages: ISO-like code -> English language name used in translation prompts.
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian"
}
# LLM initialized with a low temperature for reproducibility.
llm = OllamaLLM(model=MODEL_NAME, temperature=0)
# --- HISTORY HANDLING (RESUME) ---
def load_history(history_file=None):
    """Return the set of relative paths that were already processed.

    Args:
        history_file: Optional path to the history log. Defaults to the
            module-level HISTORY_FILE, so existing callers are unchanged.

    Returns:
        Set of non-empty, stripped lines from the log (one processed file
        path each); an empty set when the log does not exist yet.
    """
    path = HISTORY_FILE if history_file is None else history_file
    if not os.path.exists(path):
        return set()
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}
def mark_as_done(rel_path, history_file=None):
    """Append *rel_path* to the history log so it is skipped on the next run.

    Args:
        rel_path: Relative path of the file that was fully processed.
        history_file: Optional log path. Defaults to the module-level
            HISTORY_FILE, keeping the original call signature working.
    """
    path = HISTORY_FILE if history_file is None else history_file
    with open(path, 'a', encoding='utf-8') as f:
        f.write(f"{rel_path}\n")
# --- LLM PROMPTS ---
def ask_llm_json(prompt):
    """Invoke the LLM in JSON mode and safely parse the reply.

    Returns the decoded object, or None when the model reply is not valid
    JSON or the LLM call itself fails (an error is printed in both cases).
    """
    try:
        # format="json" is the Ollama feature that forces well-formed JSON output.
        return json.loads(llm.invoke(prompt, format="json"))
    except json.JSONDecodeError as e:
        print(f"\n ⚠️ Błąd składni JSON od AI: {e}")
        return None
    except Exception as e:
        print(f"\n ⚠️ Błąd komunikacji z LLM: {e}")
        return None
def ask_llm_text(prompt):
    """Invoke the LLM and return its reply stripped of surrounding
    whitespace and quote characters.

    Falls back to the literal string "Translation Error" on any failure.
    """
    try:
        reply = llm.invoke(prompt)
        cleaned = reply.strip().strip('"').strip("'")
    except Exception:
        cleaned = "Translation Error"
    return cleaned
def get_metadata(text, hinted_type):
    """Ask the LLM for document metadata as a dict.

    Expected keys in the reply: "title_base", "summary_base", "category",
    "info". Returns None when the model fails to produce valid JSON.
    Only the first 3500 characters of *text* are sent to the model.
    """
    # Prompt with explicit instructions for the JSON format.
    return ask_llm_json(f"""
Analyze this document text.
Folder hint: {hinted_type}
Return ONLY a JSON object with these keys:
- "title_base": Factual title in ENGLISH (format: "[Type] - [Entity] - [Date]")
- "summary_base": Factual summary in ENGLISH (exactly 5 sentences)
- "category": One of: financial, legal, personal, health, property, other
- "info": Key details (e.g. document ID or service name)
Ensure all quotes inside the text are properly escaped.
TEXT:
{text[:3500]}
""")
def translate_section(text, target_lang, content_type="text"):
    """Translate *text* into *target_lang* via the LLM.

    Args:
        text: The text to translate.
        target_lang: English name of the target language (e.g. "German").
        content_type: Short label for the kind of content ("title",
            "summary", ...) embedded in the prompt.

    Returns:
        The translated string, or "Translation Error" on failure.
    """
    return ask_llm_text(f"""
Translate the following {content_type} into {target_lang}.
Output ONLY the translation. No conversational text or markdown.
TEXT TO TRANSLATE:
{text}
""")
def save_output(root, kind, lang, subdir, filename, content):
    """Write str(content) to <root>/<kind>[/<lang>]/<subdir>/<filename>,
    creating any missing directories.

    The language level is skipped entirely when *lang* is falsy (used for
    language-independent outputs such as raw content or category).
    """
    target_dir = Path(root) / kind
    if lang:
        target_dir = target_dir / lang
    target_dir = target_dir / subdir
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / filename).write_text(str(content), encoding="utf-8")
# --- MAIN PER-FILE LOGIC ---
def process_file(file_path, input_root):
    """Process one source document end to end.

    Reads the file, asks the LLM for metadata, stores the base
    (language-independent) outputs, then writes a translated title and
    summary for every language in TARGET_LANGUAGES. On success the file
    is recorded in the history log so it is skipped next run.

    Args:
        file_path: Path to the input .txt document.
        input_root: Root directory used to compute the relative layout.
    """
    rel_path = file_path.relative_to(input_root)
    base_filename = rel_path.name
    sub_dir = rel_path.parent
    doc_type = sub_dir.name  # the folder name doubles as the document-type hint

    try:
        raw_text = file_path.read_text(encoding='utf-8')
    except Exception as e:
        print(f" ❌ Błąd odczytu pliku: {e}")
        return

    # Metadata generation (JSON); bail out if the model did not return a dict.
    meta = get_metadata(raw_text, doc_type)
    if not meta or not isinstance(meta, dict):
        print(" ❌ Błąd AI: Nie udało się wygenerować poprawnego JSONa.")
        return

    # Language-independent outputs.
    save_output(OUTPUT_ROOT, "content", None, sub_dir, base_filename, raw_text)
    save_output(OUTPUT_ROOT, "category", None, sub_dir, base_filename, meta.get("category", "other"))
    save_output(OUTPUT_ROOT, "type", None, sub_dir, base_filename, doc_type)
    save_output(OUTPUT_ROOT, "info", None, sub_dir, base_filename, meta.get("info", "none"))

    base_title = meta.get("title_base", "Document")
    base_summary = meta.get("summary_base", "No summary available.")

    # Translations: English reuses the base text, other languages go through the LLM.
    print(f" 🌍 Tłumaczenie na {len(TARGET_LANGUAGES)} języków...", end="", flush=True)
    for code, lang_name in TARGET_LANGUAGES.items():
        title = base_title if code == "en" else translate_section(base_title, lang_name, "title")
        save_output(OUTPUT_ROOT, "titles", code, sub_dir, base_filename, title)
        summary = base_summary if code == "en" else translate_section(base_summary, lang_name, "summary")
        save_output(OUTPUT_ROOT, "summary", code, sub_dir, base_filename, summary)
        print(".", end="", flush=True)
    print(" OK")

    mark_as_done(str(rel_path))
def main():
    """Entry point: walk the input tree and process every .txt file that is
    not already recorded in the history log."""
    input_path = Path(INPUT_DIR)
    if not input_path.exists():
        print(f"❌ Brak folderu wejściowego: {INPUT_DIR}")
        return

    processed = load_history()
    print(f"📂 Historia: {len(processed)} plików już przetworzonych.")

    files = list(input_path.rglob("*.txt"))
    print(f"🚀 Start: {len(files)} plików do analizy.")

    for candidate in files:
        rel_path = str(candidate.relative_to(input_path))
        if rel_path in processed:
            continue  # already handled in a previous run
        print(f"📄 Przetwarzam: {rel_path}")
        try:
            process_file(candidate, input_path)
        except KeyboardInterrupt:
            # Manual stop: progress is already persisted via the history file.
            print("\n🛑 Przerwano ręcznie. Postęp zapisany.")
            break
        except Exception as e:
            # One failing document must not abort the whole batch.
            print(f"\n❌ Błąd krytyczny przy pliku {rel_path}: {e}")
# Run only when executed as a script, not when imported.
# (Fix: removed a stray "|" extraction artifact that broke the syntax.)
if __name__ == "__main__":
    main()