File size: 5,656 Bytes
8fd4eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import os
import json
from pathlib import Path
from langchain_ollama import OllamaLLM

# --- CONFIGURATION ---
INPUT_DIR = "synthetic_content"       # root folder containing the source .txt documents
OUTPUT_ROOT = "synthetic_dataset"     # root folder for the generated dataset tree
HISTORY_FILE = "processed_synthetic_scans_contents.txt"  # resume log: one relative path per line
MODEL_NAME = "llama3"                 # Ollama model used for metadata and translation

# Target languages: ISO 639-1 code -> English language name (the name is fed to the prompts)
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian"
}

# LLM initialization with low temperature for reproducible outputs
llm = OllamaLLM(model=MODEL_NAME, temperature=0)

# --- OBSŁUGA HISTORII (RESUME) ---
def load_history():
    """Return the set of already-processed relative paths from the resume log.

    An absent log file simply means nothing has been processed yet.
    """
    if not os.path.exists(HISTORY_FILE):
        return set()
    done = set()
    with open(HISTORY_FILE, 'r', encoding='utf-8') as log:
        for raw_line in log:
            entry = raw_line.strip()
            if entry:
                done.add(entry)
    return done

def mark_as_done(rel_path):
    """Append *rel_path* to the resume log so the file is skipped on the next run."""
    with open(HISTORY_FILE, 'a', encoding='utf-8') as log:
        log.write(str(rel_path) + "\n")

# --- PROMPTY LLM ---
def ask_llm_json(prompt):
    """Invoke the LLM in JSON mode and safely parse the result.

    Returns the parsed object, or None on a JSON-syntax or communication error.
    """
    try:
        # format="json" is the key Ollama feature that forces well-formed JSON
        reply = llm.invoke(prompt, format="json")
        parsed = json.loads(reply)
    except json.JSONDecodeError as err:
        print(f"\n   ⚠️ Błąd składni JSON od AI: {err}")
        return None
    except Exception as err:
        print(f"\n   ⚠️ Błąd komunikacji z LLM: {err}")
        return None
    return parsed

def ask_llm_text(prompt):
    """Invoke the LLM and return its reply stripped of whitespace and outer quotes.

    Any failure (communication or a non-string reply) yields "Translation Error".
    """
    try:
        return llm.invoke(prompt).strip().strip('"').strip("'")
    except Exception:
        return "Translation Error"

def get_metadata(text, hinted_type):
    """Ask the LLM for structured English metadata about *text*.

    Only the first 3500 characters are sent. Returns a dict with keys
    title_base / summary_base / category / info, or None on failure.
    """
    # Prompt with explicit instructions for the JSON format
    metadata_prompt = f"""
    Analyze this document text.
    Folder hint: {hinted_type}

    Return ONLY a JSON object with these keys:
    - "title_base": Factual title in ENGLISH (format: "[Type] - [Entity] - [Date]")
    - "summary_base": Factual summary in ENGLISH (exactly 5 sentences)
    - "category": One of: financial, legal, personal, health, property, other
    - "info": Key details (e.g. document ID or service name)

    Ensure all quotes inside the text are properly escaped.
    
    TEXT:
    {text[:3500]}
    """
    return ask_llm_json(metadata_prompt)

def translate_section(text, target_lang, content_type="text"):
    """Translate *text* into *target_lang* via the LLM and return the bare translation."""
    translation_prompt = f"""
    Translate the following {content_type} into {target_lang}.
    Output ONLY the translation. No conversational text or markdown.
    
    TEXT TO TRANSLATE:
    {text}
    """
    return ask_llm_text(translation_prompt)

def save_output(root, kind, lang, subdir, filename, content):
    """Write *content* (stringified) to root/kind[/lang]/subdir/filename.

    Intermediate directories are created as needed. A falsy *lang*
    (None or "") skips the language level of the path.
    """
    target_dir = Path(root) / kind
    if lang:
        target_dir = target_dir / lang
    target_dir = target_dir / subdir
    target_dir.mkdir(parents=True, exist_ok=True)
    # write_text replaces the manual open/write pair and closes the file for us
    (target_dir / filename).write_text(str(content), encoding="utf-8")

# --- GŁÓWNA LOGIKA PLIKU ---
def process_file(file_path, input_root):
    """Process one source document end to end.

    Reads the file, asks the LLM for English metadata, saves the
    language-independent artifacts (content/category/type/info), then writes
    a title and summary for every language in TARGET_LANGUAGES. Marks the
    file as done in the resume log only after everything succeeded.

    Failures (unreadable file, bad LLM JSON) are reported and the function
    returns without marking the file as done, so it is retried next run.
    """
    rel_path = file_path.relative_to(input_root)
    base_filename = rel_path.name
    sub_dir = rel_path.parent
    doc_type = sub_dir.name  # the folder name doubles as the document-type hint

    # 1. Read the raw document text
    try:
        raw_text = file_path.read_text(encoding='utf-8')
    except Exception as e:
        print(f"   ❌ Błąd odczytu pliku: {e}")
        return

    # 2. Generate metadata (JSON)
    meta = get_metadata(raw_text, doc_type)
    if not meta or not isinstance(meta, dict):
        print("   ❌ Błąd AI: Nie udało się wygenerować poprawnego JSONa.")
        return

    # 3. Save language-independent data
    save_output(OUTPUT_ROOT, "content", None, sub_dir, base_filename, raw_text)
    save_output(OUTPUT_ROOT, "category", None, sub_dir, base_filename, meta.get("category", "other"))
    save_output(OUTPUT_ROOT, "type", None, sub_dir, base_filename, doc_type)
    save_output(OUTPUT_ROOT, "info", None, sub_dir, base_filename, meta.get("info", "none"))

    base_title = meta.get("title_base", "Document")
    base_summary = meta.get("summary_base", "No summary available.")

    # 4. Translations
    print(f"   🌍 Tłumaczenie na {len(TARGET_LANGUAGES)} języków...", end="", flush=True)

    def _localize(base_text, code, lang_name, content_type):
        # The base metadata is already English, so "en" reuses it verbatim;
        # every other language goes through the LLM translator.
        if code == "en":
            return base_text
        return translate_section(base_text, lang_name, content_type)

    for code, lang_name in TARGET_LANGUAGES.items():
        title = _localize(base_title, code, lang_name, "title")
        save_output(OUTPUT_ROOT, "titles", code, sub_dir, base_filename, title)

        summary = _localize(base_summary, code, lang_name, "summary")
        save_output(OUTPUT_ROOT, "summary", code, sub_dir, base_filename, summary)

        print(".", end="", flush=True)

    print(" OK")
    mark_as_done(str(rel_path))

def main():
    """Entry point: walk the input tree and process every not-yet-seen .txt file.

    Already-processed files (per the resume log) are skipped, Ctrl+C stops
    cleanly, and a failure in one file does not abort the rest of the run.
    """
    input_path = Path(INPUT_DIR)
    if not input_path.exists():
        print(f"❌ Brak folderu wejściowego: {INPUT_DIR}")
        return

    processed = load_history()
    print(f"📂 Historia: {len(processed)} plików już przetworzonych.")

    files = list(input_path.rglob("*.txt"))
    print(f"🚀 Start: {len(files)} plików do analizy.")

    for candidate in files:
        relative = str(candidate.relative_to(input_path))
        if relative in processed:
            continue

        print(f"📄 Przetwarzam: {relative}")
        try:
            process_file(candidate, input_path)
        except KeyboardInterrupt:
            # Progress is durable: each finished file was appended to the log.
            print("\n🛑 Przerwano ręcznie. Postęp zapisany.")
            break
        except Exception as e:
            # Keep going — one broken file must not stop the whole batch.
            print(f"\n❌ Błąd krytyczny przy pliku {relative}: {e}")


if __name__ == "__main__":
    main()